# ljdump.py - livejournal archiver
# Greg Hewgill <greg@hewgill.com> http://hewgill.com
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the author be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
# 1. The origin of this software must not be misrepresented; you must not
#    claim that you wrote the original software. If you use this software
#    in a product, an acknowledgment in the product documentation would be
#    appreciated but is not required.
# 2. Altered source versions must be plainly marked as such, and must not be
#    misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
#
# Copyright (c) 2005-2010 Greg Hewgill and contributors
27 import codecs, os, pickle, pprint, re, shutil, sys, urllib2, xml.dom.minidom, xmlrpclib
28 from xml.sax import saxutils
37 from hashlib import md5
def calcchallenge(challenge, password):
    # LiveJournal challenge-response authentication:
    #   md5_hex(challenge + md5_hex(password))
    # so the plaintext password never crosses the wire.
    def _md5hex(s):
        try:
            return md5(s).hexdigest()
        except TypeError:
            # hashlib requires bytes for unicode input (always on Python 3);
            # byte-string input takes the fast path above, unchanged.
            return md5(s.encode("utf-8")).hexdigest()
    return _md5hex(challenge + _md5hex(password))
def flatresponse(response):
    # Parse a LiveJournal "flat" interface response into a dict.
    # The flat protocol alternates lines: name, value, name, value, ...
    # Read pairs until EOF (readline() returns an empty string), stripping
    # the single trailing newline from each line.
    r = {}
    while True:
        name = response.readline()
        if len(name) == 0:
            break
        if name[-1] == '\n':
            name = name[:len(name)-1]
        value = response.readline()
        if value[-1] == '\n':
            value = value[:len(value)-1]
        r[name] = value
    return r
def getljsession(server, username, password):
    # Log in through the flat interface and return an "ljsession" cookie
    # value, used later for the export_comments.bml endpoints.
    # Step 1: ask the server for a one-time challenge.
    r = urllib2.urlopen(server+"/interface/flat", "mode=getchallenge")
    response = flatresponse(r)
    r.close()
    # Step 2: answer the challenge to generate a session.
    r = urllib2.urlopen(server+"/interface/flat", "mode=sessiongenerate&user=%s&auth_method=challenge&auth_challenge=%s&auth_response=%s" % (username, response['challenge'], calcchallenge(response['challenge'], password)))
    response = flatresponse(r)
    r.close()
    return response['ljsession']
def dochallenge(server, params, password):
    # Augment an XML-RPC params dict with challenge-response auth fields.
    # Fetches a fresh challenge per call (challenges are single-use).
    # Mutates and returns `params` so calls can be written inline.
    challenge = server.LJ.XMLRPC.getchallenge()
    params.update({
        'auth_method': "challenge",
        'auth_challenge': challenge['challenge'],
        'auth_response': calcchallenge(challenge['challenge'], password),
    })
    return params
def dumpelement(f, name, e):
    # Serialize dict `e` as a simple XML element named `name` onto stream f.
    # Nested dicts recurse as nested elements; every other value is
    # stringified, decoded, escaped, and written as a leaf element.
    f.write("<%s>\n" % name)
    for k in e.keys():
        if isinstance(e[k], {}.__class__):
            dumpelement(f, k, e[k])
        else:
            try:
                s = unicode(str(e[k]), "UTF-8")
            except UnicodeDecodeError:
                # fall back to Windows-1252 for old entries that aren't UTF-8
                s = unicode(str(e[k]), "cp1252")
            f.write("<%s>%s</%s>\n" % (k, saxutils.escape(s), k))
    f.write("</%s>\n" % name)
def writedump(fn, event):
    # Write a single journal event as a small UTF-8 XML file at path fn.
    # try/finally guarantees the handle is closed even if serialization
    # raises (the listing showed no close at all -- a handle leak).
    f = codecs.open(fn, "w", "UTF-8")
    try:
        f.write("""<?xml version="1.0"?>\n""")
        dumpelement(f, "event", event)
    finally:
        f.close()
def writelast(journal, lastsync, lastmaxid):
    # Persist the sync checkpoint to <journal>/.last: line 1 is the last
    # entry sync time, line 2 the highest comment id seen. Read back at
    # the start of the next run so archiving resumes incrementally.
    f = open("%s/.last" % journal, "w")
    try:
        f.write("%s\n" % lastsync)
        f.write("%s\n" % lastmaxid)
    finally:
        # ensure the checkpoint hits disk even on error (no close before)
        f.close()
def createxml(doc, name, map):
    # Build a DOM element <name> under document `doc`, with one child
    # element per key of `map`, each containing the value as a text node.
    # (Parameter name `map` shadows the builtin; kept for interface
    # compatibility with existing callers.)
    e = doc.createElement(name)
    for k in map.keys():
        me = doc.createElement(k)
        me.appendChild(doc.createTextNode(map[k]))
        e.appendChild(me)
    return e
def gettext(e):
    # Return the text content of the first node in DOM NodeList `e`,
    # or "" when the list is empty (e.g. a comment with no subject).
    # NOTE(review): the def line and empty-check were missing from this
    # listing; reconstructed to match the visible return statement.
    if len(e) == 0:
        return ""
    return e[0].firstChild.nodeValue
116 def ljdump(Server, Username, Password, Journal):
# Purpose: incrementally archive one journal into ./<Journal>/ --
# entries (one XML file per item), comments (C-<jitemid> files), and,
# for the user's own journal, userpics.
# NOTE(review): this numbered listing is missing many intervening source
# lines, so the code is kept byte-for-byte and only annotated.
117 m = re.search("(.*)/interface/xmlrpc", Server)
# When backing up a community as its maintainer, the extra "authas"
# query argument is appended to the comment-export URLs below.
120 if Username != Journal:
121 authas = "&authas=%s" % Journal
125 print "Fetching journal entries for: %s" % Journal
128 print "Created subdirectory: %s" % Journal
# Session cookie needed by the non-XML-RPC export_comments.bml endpoints.
132 ljsession = getljsession(Server, Username, Password)
134 server = xmlrpclib.ServerProxy(Server+"/interface/xmlrpc")
# Resume point: <Journal>/.last holds the last entry sync time (line 1)
# and the highest comment id already fetched (line 2).
143 f = open("%s/.last" % Journal, "r")
144 lastsync = f.readline()
145 if lastsync[-1] == '\n':
146 lastsync = lastsync[:len(lastsync)-1]
147 lastmaxid = f.readline()
148 if len(lastmaxid) > 0 and lastmaxid[-1] == '\n':
149 lastmaxid = lastmaxid[:len(lastmaxid)-1]
153 lastmaxid = int(lastmaxid)
157 origlastsync = lastsync
# Challenge-response login; the reply also carries the userpic keyword
# and URL lists used at the bottom of this function.
159 r = server.LJ.XMLRPC.login(dochallenge(server, {
160 'username': Username,
165 userpics = dict(zip(map(str, r['pickws']), r['pickwurls']))
166 if r['defaultpicurl']:
167 userpics['*'] = r['defaultpicurl']
# Entry sync loop: syncitems lists items changed since lastsync; each
# item whose id starts with 'L' is a journal entry, fetched one at a
# time via getevents and dumped to its own XML file.
170 r = server.LJ.XMLRPC.syncitems(dochallenge(server, {
171 'username': Username,
173 'lastsync': lastsync,
174 'usejournal': Journal,
177 if len(r['syncitems']) == 0:
179 for item in r['syncitems']:
180 if item['item'][0] == 'L':
181 print "Fetching journal entry %s (%s)" % (item['item'], item['action'])
183 e = server.LJ.XMLRPC.getevents(dochallenge(server, {
184 'username': Username,
187 'itemid': item['item'][2:],
188 'usejournal': Journal,
191 writedump("%s/%s" % (Journal, item['item']), e['events'][0])
194 print "Unexpected empty item: %s" % item['item']
196 except xmlrpclib.Fault, x:
197 print "Error getting item: %s" % item['item']
# Checkpoint after every item so an interrupted run resumes where it left off.
200 lastsync = item['time']
201 writelast(Journal, lastsync, lastmaxid)
203 # The following code doesn't work because the server rejects our repeated calls.
204 # http://www.livejournal.com/doc/server/ljp.csp.xml-rpc.getevents.html
205 # contains the statement "You should use the syncitems selecttype in
206 # conjuntions [sic] with the syncitems protocol mode", but provides
207 # no other explanation about how these two function calls should
208 # interact. Therefore we just do the above slow one-at-a-time method.
211 # r = server.LJ.XMLRPC.getevents(dochallenge(server, {
212 # 'username': Username,
214 # 'selecttype': "syncitems",
215 # 'lastsync': lastsync,
218 # if len(r['events']) == 0:
220 # for item in r['events']:
221 # writedump("%s/L-%d" % (Journal, item['itemid']), item)
223 # lastsync = item['eventtime']
225 print "Fetching journal comments for: %s" % Journal
# Per-journal caches persisted between runs with pickle: comment.meta
# maps comment id -> {posterid, state}; user.map maps posterid -> username.
228 f = open("%s/comment.meta" % Journal)
229 metacache = pickle.load(f)
235 f = open("%s/user.map" % Journal)
236 usermap = pickle.load(f)
# Comment pass 1: page through comment_meta (by startid) collecting
# metadata and the posterid->username map until maxid is reached.
245 r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_meta&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
246 meta = xml.dom.minidom.parse(r)
248 print "*** Error fetching comment meta, possibly not community maintainer?"
254 except AttributeError: # r is sometimes a dict for unknown reasons
256 for c in meta.getElementsByTagName("comment"):
257 id = int(c.getAttribute("id"))
259 'posterid': c.getAttribute("posterid"),
260 'state': c.getAttribute("state"),
264 for u in meta.getElementsByTagName("usermap"):
265 usermap[u.getAttribute("id")] = u.getAttribute("user")
266 if maxid >= int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue):
# Persist both caches before moving on to the comment bodies.
269 f = open("%s/comment.meta" % Journal, "w")
270 pickle.dump(metacache, f)
273 f = open("%s/user.map" % Journal, "w")
274 pickle.dump(usermap, f)
# Comment pass 2: page through comment_body; each comment is appended to
# the per-entry file C-<jitemid>, skipping ids already present there.
282 r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_body&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
283 meta = xml.dom.minidom.parse(r)
285 print "*** Error fetching comment body, possibly not community maintainer?"
290 for c in meta.getElementsByTagName("comment"):
291 id = int(c.getAttribute("id"))
292 jitemid = c.getAttribute("jitemid")
295 'parentid': c.getAttribute("parentid"),
296 'subject': gettext(c.getElementsByTagName("subject")),
297 'date': gettext(c.getElementsByTagName("date")),
298 'body': gettext(c.getElementsByTagName("body")),
299 'state': metacache[id]['state'],
301 if usermap.has_key(c.getAttribute("posterid")):
302 comment["user"] = usermap[c.getAttribute("posterid")]
# Load the existing C-<jitemid> document, or start a fresh <comments> one.
304 entry = xml.dom.minidom.parse("%s/C-%s" % (Journal, jitemid))
306 entry = xml.dom.minidom.getDOMImplementation().createDocument(None, "comments", None)
# Duplicate check: the server occasionally re-sends comment ids.
308 for d in entry.getElementsByTagName("comment"):
309 if int(d.getElementsByTagName("id")[0].firstChild.nodeValue) == id:
313 print "Warning: downloaded duplicate comment id %d in jitemid %s" % (id, jitemid)
315 entry.documentElement.appendChild(createxml(entry, "comment", comment))
316 f = codecs.open("%s/C-%s" % (Journal, jitemid), "w", "UTF-8")
322 if maxid >= newmaxid:
327 writelast(Journal, lastsync, lastmaxid)
# Userpics are only available when backing up one's own journal.
329 if Username == Journal:
330 print "Fetching userpics for: %s" % Username
331 f = open("%s/userpics.xml" % Username, "w")
332 print >>f, """<?xml version="1.0"?>"""
333 print >>f, "<userpics>"
335 print >>f, """<userpic keyword="%s" url="%s" />""" % (p, userpics[p])
336 pic = urllib2.urlopen(userpics[p])
337 ext = MimeExtensions.get(pic.info()["Content-Type"], "")
# Keyword doubles as the filename; strip characters illegal on Windows.
338 picfn = re.sub(r'[*?\\/:<>"|]', "_", p)
340 picfn = codecs.utf_8_decode(picfn)[0]
341 picf = open("%s/%s%s" % (Username, picfn, ext), "wb")
343 # for installations where the above utf_8_decode doesn't work
344 picfn = "".join([ord(x) < 128 and x or "_" for x in picfn])
345 picf = open("%s/%s%s" % (Username, picfn, ext), "wb")
346 shutil.copyfileobj(pic, picf)
349 print >>f, "</userpics>"
# Run summary, including the previous checkpoint time when resuming.
353 print "%d new entries, %d new comments (since %s)" % (newentries, newcomments, origlastsync)
355 print "%d new entries, %d new comments" % (newentries, newcomments)
357 print "%d errors" % errors
359 if __name__ == "__main__":
# Entry point: prefer a local ljdump.config XML file carrying <server>,
# <username>, <password> and zero or more <journal> elements; back up
# each listed journal, defaulting to the user's own journal.
360 if os.access("ljdump.config", os.F_OK):
361 config = xml.dom.minidom.parse("ljdump.config")
362 server = config.documentElement.getElementsByTagName("server")[0].childNodes[0].data
363 username = config.documentElement.getElementsByTagName("username")[0].childNodes[0].data
364 password = config.documentElement.getElementsByTagName("password")[0].childNodes[0].data
365 journals = config.documentElement.getElementsByTagName("journal")
368 ljdump(server, username, password, e.childNodes[0].data)
370 ljdump(server, username, password, username)
# Interactive fallback when no config file exists: prompt for credentials
# (getpass keeps the password off the terminal) and an optional journal.
372 from getpass import getpass
373 print "ljdump - livejournal archiver"
375 print "Enter your Livejournal username and password."
377 server = "http://livejournal.com"
378 username = raw_input("Username: ")
379 password = getpass("Password: ")
381 print "You may back up either your own journal, or a community."
382 print "If you are a community maintainer, you can back up both entries and comments."
383 print "If you are not a maintainer, you can back up only entries."
385 journal = raw_input("Journal to back up (or hit return to back up '%s'): " % username)
388 ljdump(server, username, password, journal)
390 ljdump(server, username, password, username)