#
# ljdump.py - livejournal archiver
# Greg Hewgill <greg@hewgill.com> http://hewgill.com
# Version 1.5.1
#
# LICENSE
#
# misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
#
# Copyright (c) 2005-2010 Greg Hewgill and contributors
# Resolved leftover patch markers: the old import line pulled in the
# deprecated md5 module, which is replaced below by a hashlib shim.
import codecs, os, pickle, pprint, re, shutil, sys, urllib2, xml.dom.minidom, xmlrpclib
import time
from xml.sax import saxutils
# Maps a downloaded userpic's MIME type to the file extension used when
# saving the image to disk.
MimeExtensions = {
    "image/png": ".png",
}
# Compatibility shim: Python 2.5+ exposes MD5 through hashlib; older
# interpreters only have the deprecated md5 module. Normalize both to a
# callable named `md5` that returns a fresh hash object.
try:
    from hashlib import md5
except ImportError:
    import md5 as _md5
    md5 = _md5.new
def calcchallenge(challenge, password):
    # LiveJournal challenge-response authentication: the response value is
    # md5(challenge + md5(password).hexdigest()).hexdigest(), so the plain
    # password never crosses the wire. `md5` is the hashlib-compatible
    # callable set up at the top of the file.
    return md5(challenge+md5(password).hexdigest()).hexdigest()
def flatresponse(response):
r = {}
'getpickwurls': 1,
}, Password))
userpics = dict(zip(map(str, r['pickws']), r['pickwurls']))
- userpics['*'] = r['defaultpicurl']
+ if r['defaultpicurl']:
+ userpics['*'] = r['defaultpicurl']
while True:
+ time.sleep(0.2)
r = server.LJ.XMLRPC.syncitems(dochallenge(server, {
'username': Username,
'ver': 1,
if item['item'][0] == 'L':
print "Fetching journal entry %s (%s)" % (item['item'], item['action'])
try:
+ time.sleep(0.2)
e = server.LJ.XMLRPC.getevents(dochallenge(server, {
'username': Username,
'ver': 1,
print "Error getting item: %s" % item['item']
pprint.pprint(x)
errors += 1
+ if str(x).find("will be able to continue posting within an hour."):
+ print "Waiting a hour"
+ time.sleep(3600)
+ continue
lastsync = item['time']
writelast(Journal, lastsync, lastmaxid)
maxid = lastmaxid
while True:
try:
- try:
+ try:
+ time.sleep(0.2)
r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_meta&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
meta = xml.dom.minidom.parse(r)
- except:
+ except Exception, x:
print "*** Error fetching comment meta, possibly not community maintainer?"
- break
+ print "***", x
+ maxid += 200
+ continue
finally:
- r.close()
+ try:
+ r.close()
+ except AttributeError: # r is sometimes a dict for unknown reasons
+ pass
+ nxid=meta.getElementsByTagName("nextid")
+ if len(nxid):
+ nxid = nxid[0].firstChild.nodeValue
+ else:
+ nxid = None
+ print "Got meta data maxid = %d nextid=%s"%(
+ int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue),
+ nxid
+ )
for c in meta.getElementsByTagName("comment"):
id = int(c.getAttribute("id"))
metacache[id] = {
try:
r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_body&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
meta = xml.dom.minidom.parse(r)
- except:
+ except Exception, x:
print "*** Error fetching comment body, possibly not community maintainer?"
- break
+ print "*** requested id %d "%(maxid+1)
+ maxid+=1
+ print "***", x
+ continue
finally:
r.close()
for c in meta.getElementsByTagName("comment"):
ljdump(server, username, password, journal)
else:
ljdump(server, username, password, username)
# vim:ts=4 et: