# Released by Dianne Dietrich under a GNU General Public License
#!/usr/bin/env python
# Amazon POD Processing Script for Code4Lib
#
# Reads a text file with one author heading per line (comma-separated, e.g.
# "Lastname, Firstname, Title") and prints one tab-separated row per line:
#
#   title, firstname, middlename, lastname, suffix, contributor, nochange
#
# "nochange" is "True" when the heading could not be split with confidence;
# in that case the whole stripped line is emitted in the lastname column so
# that it can be examined more closely by hand.
#
# print() is always called with a single argument so the script behaves the
# same under Python 2 and Python 3.

import re
import sys

# All patterns are compiled once here instead of once per input line.

# A place or corporate name anywhere in the heading means "do not touch".
_PLACE_NAME_RE = re.compile(
    r'Congress|Atlanta|Ontario|Halifax|Association|Missouri|'
    r'United States|Republican|Confederate|Democratic|New York|Rhode Island|'
    r'Massachusetts|Pennsylvania|Great Britain|Florida|University|College|Boston|'
    r'England|Washington D\.C\.|\[|\]'
)

# An abbreviated state name (preceded by a space, followed by a period)
# anywhere in the heading also means "do not touch".
_STATE_NAME_RE = re.compile(
    r' (Ala|Ark|Ariz|Calif|Colo|Conn|Del|Fla|Ga|Ill|Ind|Kan|Ky|La|'
    r'Me|Mass|Md|Mich|Minn|Mo|Miss|Mont|N\.C|N\.D|Neb|N\.H|N\.J|N\.M|Nev|N\.Y|Okla|'
    r'Ore|Pa|R\.I|S\.C|S\.D|Tenn|Vt|Va|Wash|Wis|W\.Va|Wyo)\.'
)

# Second field ends in a four-digit year (optionally followed by . - ,).
_YEAR_RE = re.compile(r'\d{4}[\.\-,]*$')

# Third field ends in "<title> of/de/von/d'/di", e.g. " 2nd Duke of".
_BARON_OF_RE = re.compile(
    r'(.* )(marquis|vicompte|prince|marquise|marquess|'
    r'marchioness|graf|grafin|furst|freiherr|freifrau|earl|duke|duchess|duchesse|countess|'
    r'viscount|count|conte|conti|compte|baron|apb\.|bishop|bp\.|duc|comte|vicomte|'
    r"lady) (of|de|von|de|d'|di)$",
    re.I,
)

# Third field (leading spaces removed) starts with "of ".
_OF_RE = re.compile(r'of (.*)$', re.I)

# Third field ends in a title / honorific, optionally followed by periods.
_BARON_RE = re.compile(
    r'(.* )(mrs|abp|viscountess|hrabe|saint|princess|prince|lord|'
    r'lady|hon|hon|graf|father|earl|duc|dame|countess|count|viscount|contessa|conte|comtesse|'
    r'compte|chaplain|cardinal|bp|baroness|barone|baron|abbe|sir|rev|mme|kniaz|colonel|'
    r'freiherr)(\.*)$',
    re.I,
)

# Third field ends in a contributor role abbreviation.
_CONTRIB_RE = re.compile(r'(ed\.|tr\.|ed\. and tr\.|pseud\.|comp\.)$', re.I)

# Third field looks like a suffix ("... of ...", "... de ...", Ph.D., M.D., Jr.).
_SUFFIX_RE = re.compile(r'(.* (of|de) .*)|ph\.\s*d\.|m\.\s*d\.|jr\.*', re.I)


def _parse_two_fields(parts):
    """Handle "Lastname, Firstname".

    Returns (title, firstname, lastname, suffix, contributor), or None when
    the second field ends in a year and the heading must be left unchanged.
    """
    if _YEAR_RE.search(parts[1]):
        return None
    return ('', parts[1].lstrip(' '), parts[0], '', '')


def _parse_titled(last, given, baron):
    """Handle a three-field heading whose third field is a title.

    `baron` is the _BARON_RE match on the third field. Returns
    (title, firstname, lastname, suffix, contributor).
    """
    title = baron.group(1).lstrip(' ') + baron.group(2).capitalize()

    # A second field starting with "de"/"of" is treated as the real last name
    # and the first field becomes the first name.  Trailing periods after the
    # title (group 3) are not kept in this case.
    if given.startswith(('de', 'of')):
        return (title, last, given, '', '')

    # Catch duplicate last names: the last name repeated inside the second
    # field is removed from the first name.  The last name is escaped so that
    # characters such as "(" or "." are matched literally instead of being
    # interpreted as regex syntax (which could raise re.error or over-match).
    duplicate = re.search('(' + re.escape(last) + r'\b)', given)
    if duplicate:
        # Every occurrence of the matched text is dropped; trailing periods
        # after the title are not kept in this case either.
        first = given.replace(duplicate.group(1), '').rstrip(' ')
        return (title, first, last, '', '')

    return (title + baron.group(3), given, last, '', '')


def _parse_three_fields(parts):
    """Handle "Lastname, Firstname, Extra".

    The scenarios are tried in a fixed order; the first that matches wins.
    Returns (title, firstname, lastname, suffix, contributor), or None when
    no scenario matches and the heading must be left unchanged.
    """
    last = parts[0]
    given = parts[1].lstrip(' ')
    extra = parts[2]
    extra_stripped = extra.lstrip(' ')

    baron_of = _BARON_OF_RE.search(extra)
    if baron_of:
        # e.g. "Buckingham, George, 2nd Duke of" -> "2nd Duke of Buckingham"
        lastname = '%s%s %s %s' % (
            baron_of.group(1).lstrip(' '),
            baron_of.group(2).capitalize(),
            baron_of.group(3),
            last,
        )
        return ('', given, lastname, '', '')

    of_phrase = _OF_RE.match(extra_stripped)
    if of_phrase:
        return ('', given, last, of_phrase.group(0), '')

    baron = _BARON_RE.search(extra)
    if baron:
        return _parse_titled(last, given, baron)

    contrib = _CONTRIB_RE.search(extra_stripped)
    if contrib:
        return ('', given, last, '', contrib.group(0).capitalize())

    suffix = _SUFFIX_RE.search(extra_stripped)
    if suffix:
        text = suffix.group(0)
        # Upper-case only the first character, leave the rest untouched.
        return ('', given, last, text[0].capitalize() + text[1:], '')

    return None


def parse_author(line):
    """Split one author heading into name components.

    `line` is a single input line; surrounding whitespace is stripped first.

    Returns a 7-tuple
    (title, firstname, middlename, lastname, suffix, contributor, nochange).
    middlename is always ''.  When nochange is True every text field is ''
    except lastname, which holds the whole stripped line.
    """
    curr = line.strip()
    parsed = None

    # Headings mentioning a place or state are never split.
    if not (_PLACE_NAME_RE.search(curr) or _STATE_NAME_RE.search(curr)):
        parts = curr.split(',')
        if len(parts) == 2:
            parsed = _parse_two_fields(parts)
        elif len(parts) == 3:
            parsed = _parse_three_fields(parts)
        # One field, or more than three: leave unchanged.

    if parsed is None:
        return ('', '', '', curr, '', '', True)

    title, firstname, lastname, suffix, contributor = parsed
    return (title, firstname, '', lastname, suffix, contributor, False)


def main(argv=None):
    """Process the file named by argv[1] and print one row per input line.

    `argv` defaults to sys.argv.  Prints a message and exits with status 1
    when no input file name was given.
    """
    if argv is None:
        argv = sys.argv
    try:
        filename = argv[1]
    except IndexError:
        print("No input file entered. Exiting.")
        sys.exit(1)

    # The context manager guarantees the input file is closed.
    with open(filename) as infile:
        for line in infile:
            row = parse_author(line)
            print('\t'.join(row[:6] + (str(row[6]),)))


if __name__ == '__main__':
    main()