From: Victor Wagner
Date: Thu, 24 Sep 2015 06:33:25 +0000 (+0300)
Subject: Add some 0.2s delays to conform to new bot policy. Correctly handle disallowing for...
X-Git-Url: http://wagner.pp.ru/gitweb/?a=commitdiff_plain;h=283cbafcc92879ab5eaa29cbc9ffc854d340af79;p=oss%2Fljdump.git

Add some 0.2s delays to conform to new bot policy. Correctly handle
disallowing for an hour if delays don't help. Try to recover from
not well-formed XML in export_comments.bml
---

diff --git a/ljdump.py b/ljdump.py
index 591a838..458dbd9 100755
--- a/ljdump.py
+++ b/ljdump.py
@@ -25,6 +25,7 @@
 # Copyright (c) 2005-2010 Greg Hewgill and contributors
 
 import codecs, os, pickle, pprint, re, shutil, sys, urllib2, xml.dom.minidom, xmlrpclib
+import time
 from xml.sax import saxutils
 
 MimeExtensions = {
@@ -167,6 +168,7 @@ def ljdump(Server, Username, Password, Journal):
         userpics['*'] = r['defaultpicurl']
 
     while True:
+        time.sleep(0.2)
         r = server.LJ.XMLRPC.syncitems(dochallenge(server, {
             'username': Username,
             'ver': 1,
@@ -180,6 +182,7 @@
             if item['item'][0] == 'L':
                 print "Fetching journal entry %s (%s)" % (item['item'], item['action'])
                 try:
+                    time.sleep(0.2)
                     e = server.LJ.XMLRPC.getevents(dochallenge(server, {
                         'username': Username,
                         'ver': 1,
@@ -197,6 +200,10 @@
                     print "Error getting item: %s" % item['item']
                     pprint.pprint(x)
                     errors += 1
+                    if str(x).find("will be able to continue posting within an hour.") != -1:
+                        print "Waiting an hour"
+                        time.sleep(3600)
+                        continue
 
             lastsync = item['time']
             writelast(Journal, lastsync, lastmaxid)
@@ -241,18 +248,29 @@
     maxid = lastmaxid
     while True:
         try:
-            try:
+            try:
+                time.sleep(0.2)
                 r = urllib2.urlopen(urllib2.Request(Server+"/export_comments.bml?get=comment_meta&startid=%d%s" % (maxid+1, authas), headers = {'Cookie': "ljsession="+ljsession}))
                 meta = xml.dom.minidom.parse(r)
             except Exception, x:
                 print "*** Error fetching comment meta, possibly not community maintainer?"
                 print "***", x
-                break
+                maxid += 200
+                continue
             finally:
                 try:
                     r.close()
                 except AttributeError: # r is sometimes a dict for unknown reasons
                     pass
+            nxid = meta.getElementsByTagName("nextid")
+            if len(nxid):
+                nxid = nxid[0].firstChild.nodeValue
+            else:
+                nxid = None
+            print "Got meta data maxid = %d nextid=%s"%(
+                int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue),
+                nxid
+                )
             for c in meta.getElementsByTagName("comment"):
                 id = int(c.getAttribute("id"))
                 metacache[id] = {
@@ -283,8 +301,10 @@
                 meta = xml.dom.minidom.parse(r)
             except Exception, x:
                 print "*** Error fetching comment body, possibly not community maintainer?"
+                print "*** requested id %d "%(maxid+1)
+                maxid += 1
                 print "***", x
-                break
+                continue
             finally:
                 r.close()
             for c in meta.getElementsByTagName("comment"):
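
Note: the pattern this patch applies at each call site (a short fixed delay before
every request, an hour-long back-off when LiveJournal reports a posting ban, and
skipping ahead instead of aborting when one response cannot be parsed) can be
summarized in a small standalone sketch. This is not part of ljdump; the helper
names fetch_with_throttle, fetch_or_recover, and RATE_LIMIT_TEXT are hypothetical,
and only the 0.2s delay, the 3600s wait, and the ban message text are taken from
the patch itself. Written in the same Python 2 style as the patched file:

    # Hypothetical sketch, not the committed code: throttle every request,
    # back off for an hour on the rate-limit message, and let the caller
    # skip ahead when a single response cannot be fetched or parsed.
    import time, urllib2, xml.dom.minidom

    RATE_LIMIT_TEXT = "will be able to continue posting within an hour."

    def fetch_with_throttle(url, delay=0.2):
        # Small fixed delay before every request, per the bot policy.
        time.sleep(delay)
        r = urllib2.urlopen(urllib2.Request(url))
        try:
            return xml.dom.minidom.parse(r)
        finally:
            r.close()

    def fetch_or_recover(url):
        try:
            return fetch_with_throttle(url)
        except Exception, x:
            if str(x).find(RATE_LIMIT_TEXT) != -1:
                print "Rate limited, waiting an hour"
                time.sleep(3600)
                return fetch_with_throttle(url)  # one retry after the ban expires
            print "Skipping response that could not be fetched or parsed:", x
            return None  # caller advances maxid and continues the loop

The patch inlines this logic at each call site rather than factoring out a helper,
since the two export_comments.bml loops advance maxid differently on failure
(by 200 for comment_meta, by 1 for comment_body).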