Amazon POD Processing Script

Download code.

Return to article.

Released by Dianne Dietrich under a GNU General Public License

#!/usr/bin/env python

# Amazon POD Processing Script for Code4Lib

import sys, re

# All patterns are compiled once at import time rather than once per input
# line.  The long alternations are written as adjacent raw-string pieces;
# Python concatenates them into a single pattern with nothing inserted at the
# joins.

# Place, party and corporate names (plus square brackets) that mark a heading
# as something other than a simple personal name.
PLACE_NAME_RE = re.compile(
    r'Congress|Atlanta|Ontario|Halifax|Association|Missouri|'
    r'United States|Republican|Confederate|Democratic|New York|Rhode Island|'
    r'Massachusetts|Pennsylvania|Great Britain|Florida|University|College|Boston|'
    r'England|Washington D\.C\.|\[|\]')

# A state abbreviation: leading space, abbreviation, trailing period.
STATE_ABBREV_RE = re.compile(
    r' (Ala|Ark|Ariz|Calif|Colo|Conn|Del|Fla|Ga|Ill|Ind|Kan|Ky|La|'
    r'Me|Mass|Md|Mich|Minn|Mo|Miss|Mont|N\.C|N\.D|Neb|N\.H|N\.J|N\.M|Nev|N\.Y|Okla|'
    r'Ore|Pa|R\.I|S\.C|S\.D|Tenn|Vt|Va|Wash|Wis|W\.Va|Wyo)\.')

# Second field ends in a four-digit year (optionally followed by . - ,).
YEAR_RE = re.compile(r'\d{4}[\.\-,]*$')

# Third field ends in "<title> of/de/von/...", e.g. " Duke of".
TITLE_OF_RE = re.compile(
    r"(.* )(marquis|vicompte|prince|marquise|marquess|"
    r"marchioness|graf|grafin|furst|freiherr|freifrau|earl|duke|duchess|duchesse|countess|"
    r"viscount|count|conte|conti|compte|baron|apb\.|bishop|bp\.|duc|comte|vicomte|"
    r"lady) (of|de|von|de|d'|di)$", re.I)

# Third field starts with "of ...".
OF_RE = re.compile(r'of (.*)$', re.I)

# Third field ends in a bare title, optionally followed by periods.
TITLE_RE = re.compile(
    r'(.* )(mrs|abp|viscountess|hrabe|saint|princess|prince|lord|'
    r'lady|hon|hon|graf|father|earl|duc|dame|countess|count|viscount|contessa|conte|comtesse|'
    r'compte|chaplain|cardinal|bp|baroness|barone|baron|abbe|sir|rev|mme|kniaz|colonel|'
    r'freiherr)(\.*)$', re.I)

# Third field ends in a contributor role.
CONTRIB_RE = re.compile(r'(ed\.|tr\.|ed\. and tr\.|pseud\.|comp\.)$', re.I)

# Third field contains " of "/" de ", or a Ph.D. / M.D. / Jr. style suffix.
SUFFIX_RE = re.compile(r'(.* (of|de) .*)|ph\.\s*d\.|m\.\s*d\.|jr\.*', re.I)


def parse_author(line):
    """Split one author heading into its name components.

    The line is stripped and split on commas.  Depending on the number of
    fields and on what the last field looks like, the pieces are assigned to
    title, first name, last name, suffix or contributor.  The middle name is
    always returned empty.

    Returns a 7-tuple
    ``(title, firstname, middlename, lastname, suffix, contributor, nochange)``.
    When ``nochange`` is True the heading could not be parsed confidently:
    the whole stripped line is returned as ``lastname`` and every other
    string field is empty.
    """
    curr = line.strip()
    unchanged = ('', '', '', curr, '', '', True)

    # Place names and state abbreviations anywhere in the string mean this is
    # a case to examine by hand.
    if PLACE_NAME_RE.search(curr) or STATE_ABBREV_RE.search(curr):
        return unchanged

    parts = curr.split(',')

    if len(parts) == 2:
        # "Last, 1850-1900": the would-be first name is really a date.
        if YEAR_RE.search(parts[1]):
            return unchanged
        return ('', parts[1].lstrip(' '), '', parts[0], '', '', False)

    # One field, or more than three, is not handled.
    if len(parts) != 3:
        return unchanged

    last = parts[0]
    first = parts[1].lstrip(' ')
    extra = parts[2]
    extra_stripped = extra.lstrip(' ')

    # The checks below run in a fixed priority order; the first match wins.

    found = TITLE_OF_RE.search(extra)
    if found:
        # "Wellington, Arthur Wellesley, Duke of" -> last "Duke of Wellington"
        lastname = '%s%s %s %s' % (found.group(1).lstrip(' '),
                                   found.group(2).capitalize(),
                                   found.group(3), last)
        return ('', first, '', lastname, '', '', False)

    found = OF_RE.match(extra_stripped)
    if found:
        return ('', first, '', last, found.group(0), '', False)

    found = TITLE_RE.search(extra)
    if found:
        title = '%s%s' % (found.group(1).lstrip(' '),
                          found.group(2).capitalize())

        # Second field is a particle: the first field becomes the first name.
        if first.startswith(('de', 'of')):
            return (title, last, '', first, '', '', False)

        # Catch duplicate last names ("Byron, George Gordon Byron, Baron").
        # The surname is escaped so that regex metacharacters in the data
        # are matched literally instead of raising or mis-matching.
        dup = re.search(r'(%s\b)' % re.escape(last), first)
        if dup:
            given = first.replace(dup.group(1), '').rstrip(' ')
            return (title, given, '', last, '', '', False)

        # Only this branch keeps the trailing periods of the title.
        return (title + found.group(3), first, '', last, '', '', False)

    found = CONTRIB_RE.search(extra_stripped)
    if found:
        return ('', first, '', last, '', found.group(0).capitalize(), False)

    found = SUFFIX_RE.search(extra_stripped)
    if found:
        text = found.group(0)
        # Capitalize the first character only; leave the rest as written.
        suffix = '%s%s' % (text[0].capitalize(), text[1:])
        return ('', first, '', last, suffix, '', False)

    return unchanged


def format_record(line):
    """Return the tab-separated output row for one input line.

    The seven columns are those returned by ``parse_author``; the final
    ``nochange`` flag is written as ``True`` or ``False``.
    """
    fields = parse_author(line)
    return '\t'.join(list(fields[:6]) + [str(fields[6])])


def main(argv=None):
    """Read the file named by the first argument and print one row per line.

    ``argv`` defaults to ``sys.argv``.  Prints a message and exits with
    status 1 when no input file is given.
    """
    if argv is None:
        argv = sys.argv

    try:
        filename = argv[1]
    except IndexError:
        print("No input file entered. Exiting.")
        sys.exit(1)

    # The context manager closes the file even if a line fails to parse.
    with open(filename) as infile:
        for line in infile:
            print(format_record(line))


if __name__ == '__main__':
    main()