X-Git-Url: https://wagner.pp.ru/gitweb/?a=blobdiff_plain;f=ljdump.py;h=458dbd91c7decfeff500d33e9638da811b8ea796;hb=283cbafcc92879ab5eaa29cbc9ffc854d340af79;hp=5b7a7a4fbdfe2221685c7710581267420ebb9d9d;hpb=245a811657855e2b846068dccdbbb83ced2c249e;p=oss%2Fljdump.git diff --git a/ljdump.py b/ljdump.py index 5b7a7a4..458dbd9 100755 --- a/ljdump.py +++ b/ljdump.py @@ -2,7 +2,7 @@ # # ljdump.py - livejournal archiver # Greg Hewgill http://hewgill.com -# Version 1.3.2 +# Version 1.5.1 # # LICENSE # @@ -22,9 +22,10 @@ # misrepresented as being the original software. # 3. This notice may not be removed or altered from any source distribution. # -# Copyright (c) 2005-2009 Greg Hewgill +# Copyright (c) 2005-2010 Greg Hewgill and contributors -import codecs, md5, os, pickle, pprint, re, shutil, sys, urllib2, xml.dom.minidom, xmlrpclib +import codecs, os, pickle, pprint, re, shutil, sys, urllib2, xml.dom.minidom, xmlrpclib +import time from xml.sax import saxutils MimeExtensions = { @@ -33,8 +34,14 @@ MimeExtensions = { "image/png": ".png", } +try: + from hashlib import md5 +except ImportError: + import md5 as _md5 + md5 = _md5.new + def calcchallenge(challenge, password): - return md5.new(challenge+md5.new(password).hexdigest()).hexdigest() + return md5(challenge+md5(password).hexdigest()).hexdigest() def flatresponse(response): r = {} @@ -88,8 +95,8 @@ def writedump(fn, event): dumpelement(f, "event", event) f.close() -def writelast(username, lastsync, lastmaxid): - f = open("%s/.last" % username, "w") +def writelast(journal, lastsync, lastmaxid): + f = open("%s/.last" % journal, "w") f.write("%s\n" % lastsync) f.write("%s\n" % lastmaxid) f.close() @@ -107,15 +114,19 @@ def gettext(e): return "" return e[0].firstChild.nodeValue -def ljdump(Server, Username, Password): +def ljdump(Server, Username, Password, Journal): m = re.search("(.*)/interface/xmlrpc", Server) if m: Server = m.group(1) + if Username != Journal: + authas = "&authas=%s" % Journal + else: + authas = "" - print "Fetching journal entries for: %s" % Username + print "Fetching journal entries for: %s" % Journal try: - os.mkdir(Username) - print "Created subdirectory: %s" % Username + os.mkdir(Journal) + print "Created subdirectory: %s" % Journal except: pass @@ -130,7 +141,7 @@ def ljdump(Server, Username, Password): lastsync = "" lastmaxid = 0 try: - f = open("%s/.last" % Username, "r") + f = open("%s/.last" % Journal, "r") lastsync = f.readline() if lastsync[-1] == '\n': lastsync = lastsync[:len(lastsync)-1] @@ -153,13 +164,16 @@ def ljdump(Server, Username, Password): 'getpickwurls': 1, }, Password)) userpics = dict(zip(map(str, r['pickws']), r['pickwurls'])) - userpics['*'] = r['defaultpicurl'] + if r['defaultpicurl']: + userpics['*'] = r['defaultpicurl'] while True: + time.sleep(0.2) r = server.LJ.XMLRPC.syncitems(dochallenge(server, { 'username': Username, 'ver': 1, 'lastsync': lastsync, + 'usejournal': Journal, }, Password)) #pprint.pprint(r) if len(r['syncitems']) == 0: @@ -168,14 +182,16 @@ def ljdump(Server, Username, Password): if item['item'][0] == 'L': print "Fetching journal entry %s (%s)" % (item['item'], item['action']) try: + time.sleep(0.2) e = server.LJ.XMLRPC.getevents(dochallenge(server, { 'username': Username, 'ver': 1, 'selecttype': "one", 'itemid': item['item'][2:], + 'usejournal': Journal, }, Password)) if e['events']: - writedump("%s/%s" % (Username, item['item']), e['events'][0]) + writedump("%s/%s" % (Journal, item['item']), e['events'][0]) newentries += 1 else: print "Unexpected empty item: %s" % item['item'] @@ -184,8 +200,12 @@ def ljdump(Server, Username, Password): print "Error getting item: %s" % item['item'] pprint.pprint(x) errors += 1 + if str(x).find("will be able to continue posting within an hour."): + print "Waiting a hour" + time.sleep(3600) + continue lastsync = item['time'] - writelast(Username, lastsync, lastmaxid) + writelast(Journal, lastsync, lastmaxid) # The following code doesn't work because the server rejects our repeated calls. # http://www.livejournal.com/doc/server/ljp.csp.xml-rpc.getevents.html @@ -205,21 +225,21 @@ def ljdump(Server, Username, Password): # if len(r['events']) == 0: # break # for item in r['events']: - # writedump("%s/L-%d" % (Username, item['itemid']), item) + # writedump("%s/L-%d" % (Journal, item['itemid']), item) # newentries += 1 # lastsync = item['eventtime'] - print "Fetching journal comments for: %s" % Username + print "Fetching journal comments for: %s" % Journal try: - f = open("%s/comment.meta" % Username) + f = open("%s/comment.meta" % Journal) metacache = pickle.load(f) f.close() except: metacache = {} try: - f = open("%s/user.map" % Username) + f = open("%s/user.map" % Journal) usermap = pickle.load(f) f.close() except: @@ -227,9 +247,30 @@ def ljdump(Server, Username, Password): maxid = lastmaxid while True: - r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_meta&startid=%d" % (maxid+1), headers = {'Cookie': "ljsession="+ljsession})) - meta = xml.dom.minidom.parse(r) - r.close() + try: + try: + time.sleep(0.2) + r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_meta&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession})) + meta = xml.dom.minidom.parse(r) + except Exception, x: + print "*** Error fetching comment meta, possibly not community maintainer?" + print "***", x + maxid += 200 + continue + finally: + try: + r.close() + except AttributeError: # r is sometimes a dict for unknown reasons + pass + nxid=meta.getElementsByTagName("nextid") + if len(nxid): + nxid = nxid[0].firstChild.nodeValue + else: + nxid = None + print "Got meta data maxid = %d nextid=%s"%( + int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue), + nxid + ) for c in meta.getElementsByTagName("comment"): id = int(c.getAttribute("id")) metacache[id] = { @@ -243,42 +284,29 @@ def ljdump(Server, Username, Password): if maxid >= int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue): break - f = open("%s/comment.meta" % Username, "w") + f = open("%s/comment.meta" % Journal, "w") pickle.dump(metacache, f) f.close() - f = open("%s/user.map" % Username, "w") + f = open("%s/user.map" % Journal, "w") pickle.dump(usermap, f) f.close() - print "Fetching userpics for: %s" % Username - f = open("%s/userpics.xml" % Username, "w") - print >>f, """""" - print >>f, "" - for p in userpics: - print >>f, """""" % (p, userpics[p]) - pic = urllib2.urlopen(userpics[p]) - ext = MimeExtensions.get(pic.info()["Content-Type"], "") - picfn = re.sub(r'[*?\\/:<>"|]', "_", p) - try: - picfn = codecs.utf_8_decode(picfn)[0] - picf = open("%s/%s%s" % (Username, picfn, ext), "wb") - except: - # for installations where the above utf_8_decode doesn't work - picfn = "".join([ord(x) < 128 and x or "_" for x in picfn]) - picf = open("%s/%s%s" % (Username, picfn, ext), "wb") - shutil.copyfileobj(pic, picf) - pic.close() - picf.close() - print >>f, "" - f.close() - newmaxid = maxid maxid = lastmaxid while True: - r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_body&startid=%d" % (maxid+1), headers = {'Cookie': "ljsession="+ljsession})) - meta = xml.dom.minidom.parse(r) - r.close() + try: + try: + r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_body&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession})) + meta = xml.dom.minidom.parse(r) + except Exception, x: + print "*** Error fetching comment body, possibly not community maintainer?" + print "*** requested id %d "%(maxid+1) + maxid+=1 + print "***", x + continue + finally: + r.close() for c in meta.getElementsByTagName("comment"): id = int(c.getAttribute("id")) jitemid = c.getAttribute("jitemid") @@ -293,7 +321,7 @@ def ljdump(Server, Username, Password): if usermap.has_key(c.getAttribute("posterid")): comment["user"] = usermap[c.getAttribute("posterid")] try: - entry = xml.dom.minidom.parse("%s/C-%s" % (Username, jitemid)) + entry = xml.dom.minidom.parse("%s/C-%s" % (Journal, jitemid)) except: entry = xml.dom.minidom.getDOMImplementation().createDocument(None, "comments", None) found = False @@ -305,7 +333,7 @@ def ljdump(Server, Username, Password): print "Warning: downloaded duplicate comment id %d in jitemid %s" % (id, jitemid) else: entry.documentElement.appendChild(createxml(entry, "comment", comment)) - f = codecs.open("%s/C-%s" % (Username, jitemid), "w", "UTF-8") + f = codecs.open("%s/C-%s" % (Journal, jitemid), "w", "UTF-8") entry.writexml(f) f.close() newcomments += 1 @@ -316,7 +344,30 @@ def ljdump(Server, Username, Password): lastmaxid = maxid - writelast(Username, lastsync, lastmaxid) + writelast(Journal, lastsync, lastmaxid) + + if Username == Journal: + print "Fetching userpics for: %s" % Username + f = open("%s/userpics.xml" % Username, "w") + print >>f, """""" + print >>f, "" + for p in userpics: + print >>f, """""" % (p, userpics[p]) + pic = urllib2.urlopen(userpics[p]) + ext = MimeExtensions.get(pic.info()["Content-Type"], "") + picfn = re.sub(r'[*?\\/:<>"|]', "_", p) + try: + picfn = codecs.utf_8_decode(picfn)[0] + picf = open("%s/%s%s" % (Username, picfn, ext), "wb") + except: + # for installations where the above utf_8_decode doesn't work + picfn = "".join([ord(x) < 128 and x or "_" for x in picfn]) + picf = open("%s/%s%s" % (Username, picfn, ext), "wb") + shutil.copyfileobj(pic, picf) + pic.close() + picf.close() + print >>f, "" + f.close() if origlastsync: print "%d new entries, %d new comments (since %s)" % (newentries, newcomments, origlastsync) @@ -331,6 +382,12 @@ if __name__ == "__main__": server = config.documentElement.getElementsByTagName("server")[0].childNodes[0].data username = config.documentElement.getElementsByTagName("username")[0].childNodes[0].data password = config.documentElement.getElementsByTagName("password")[0].childNodes[0].data + journals = config.documentElement.getElementsByTagName("journal") + if journals: + for e in journals: + ljdump(server, username, password, e.childNodes[0].data) + else: + ljdump(server, username, password, username) else: from getpass import getpass print "ljdump - livejournal archiver" @@ -341,4 +398,14 @@ if __name__ == "__main__": username = raw_input("Username: ") password = getpass("Password: ") print - ljdump(server, username, password) + print "You may back up either your own journal, or a community." + print "If you are a community maintainer, you can back up both entries and comments." + print "If you are not a maintainer, you can back up only entries." + print + journal = raw_input("Journal to back up (or hit return to back up '%s'): " % username) + print + if journal: + ljdump(server, username, password, journal) + else: + ljdump(server, username, password, username) +# vim:ts=4 et: