X-Git-Url: https://wagner.pp.ru/gitweb/?a=blobdiff_plain;ds=sidebyside;f=convertdump.py;h=0f59e8282d6dc9e2e9fa481a1312201b6468c2bf;hb=HEAD;hp=5ea760087d6a49f03b3d9d2aa46c28c7b1f76ad4;hpb=ecc0f70e4520a0b48b3ff8f4c6aa65948c951574;p=oss%2Fljdump.git diff --git a/convertdump.py b/convertdump.py index 5ea7600..0f59e82 100755 --- a/convertdump.py +++ b/convertdump.py @@ -29,6 +29,8 @@ import xml.dom.minidom import os import codecs import sys +import getopt +import re from time import strptime, strftime @@ -84,7 +86,9 @@ def addEntryForId(outDoc, element, username, id, includeSecure): # Create an event node (special case because for some reason there are two # 'event' elements in the pydump output, which is probably LJ's fault) event = inDoc.getElementsByTagName("event")[0] - appendTextNode(outDoc, entry, "event", getNodeText(event, "event")) + eventText = getNodeText(event, "event") + + appendTextNode(outDoc, entry, "event", replaceLJTags(eventText)) security = getNodeText(inDoc, "security") @@ -151,8 +155,8 @@ def addCommentsForId(outDoc, entry, username, id): getNodeText(comment, "subject")) # Create an event element - appendTextNode(outDoc, outComment, "event", - getNodeText(comment, "body")) + bodyText = getNodeText(comment, "body") + appendTextNode(outDoc, outComment, "event", replaceLJTags(bodyText)) # Create the author element author = outDoc.createElement("author") @@ -171,25 +175,94 @@ def addCommentsForId(outDoc, entry, username, id): if(parentId != ""): appendTextNode(outDoc, outComment, "parent_itemid", parentId) + +# regular expressions used in replaceLJTags() +# (global for later reuse - suggestion by jparise) + +userRE = re.compile('', re.IGNORECASE) +commRE = re.compile('', re.IGNORECASE) +namedCutRE = re.compile('', + re.IGNORECASE|re.DOTALL) +cutRE = re.compile('', re.IGNORECASE) +cutRE = re.compile('', re.IGNORECASE) +embedRE = re.compile('', re.IGNORECASE) + +def replaceLJTags(entry): + rv = entry + + # replace lj user tags + rv = re.sub(userRE, '\\1', rv) + + # replace lj comm tags + rv = re.sub(commRE, '\\1', rv) + + # replace lj-cut tags + rv = re.sub(namedCutRE, '', rv) + rv = re.sub(cutRE, '', rv) + rv = re.sub(cutRE, '', rv) + + # replace lj-embed tags + # this doesn't actually work. LJ doesn't include the embedded content + # when ljdump calls 'getevents', but instead includes an lj-embed tag + # with an id and nothing else. + #rv = re.sub(embedRE, '', rv) + + return rv + + +def usage(): + print( "Usage: convertdump.py [arguments]" ) + print( """ +This will convert a pydump archive into something compatible with the +WordPress LiveJournal importer. This is the same format used by the Windows +ljArchive exporter. + +Arguments: + -u --user username of archive to process [required] + -l --limit limit the number of entries in each xml file (default 250) + -i --insecure include private and protected entries in the output + -h --help show this help page + +Example: + ./convertdump.py --user stevemartin --limit 200 --insecure +""") + + def main(argv): username = "" entryLimit = 250 includeSecure = False; - - if( len(argv) < 2 ): - print( "Usage: convertdump.py " ) - return - else: - username = argv[0] - entryLimit = int(argv[1]) - try: - includeSecure = bool(argv[2]) - except IndexError: - includeSecure = False + if( len(argv) == 0 ): + usage() + sys.exit(2) - if(includeSecure == True): - print( "Warning: Including secure entries in XML output" ) + try: + opts, args = getopt.getopt(sys.argv[1:], "hu:l:i", ["help", + "user=", + "limit=", + "insecure"]) + except getopt.GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + + for o, a in opts: + if o == "-v": + verbose = True + elif o in ("-u", "--user"): + username = a + elif o in ("-l", "--limit"): + entryLimit = int(a) + elif o in ("-i", "--insecure"): + print( "Warning: Including secure entries in XML output" ) + includeSecure = True + elif o in ("-h", "--help"): + usage() + sys.exit() + else: + assert False, "unhandled option" userDir = os.listdir(username)