"""Parse a PubMan (eSciDoc) XML export into plain Python dictionaries.

The module downloads (or copies) an export to ``TARGET_FILE_NAME`` and
yields one dictionary per item found in it.  It can be run as a script:
``pubman.py [source]`` pretty-prints the parsed items and ``pubman.py doc``
prints documentation that is generated from the double-hash comments and
from the section between the DOCSTART/DOCEND markers in this very file.

The code was written for Python 2; it avoids version specific syntax so
that it also imports and runs on Python 3.
"""
import logging
import os
import pprint
import re
import shutil
import sys
import urllib

try:
    # Python 3 moved urlopen into urllib.request.
    from urllib.request import urlopen
except ImportError:
    # Python 2.
    from urllib import urlopen

logger = logging.getLogger('pubman')
#logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.DEBUG)

# The lxml part of the etree importing try/except handling was commented
# out by Maurits on 11 March 2009 after Mark got a test failure which was
# very likely due to having a newer lxml on his machine.  The server
# currently does not have lxml installed so it uses another (slightly
# slower) standard implementation.  We could put lxml in the buildout, but
# that has given us problems in the past, at least on Apple Macs.  So we
# just pick one import from the tree below.  Note that speed is not really
# important for us as this is only used in a nightly cron job.
#
# Actually, since elementtree is not always available, and other users (at
# pubman) may use python2.5, we simply do not try to import lxml.
#
# etree import routine from the website.  This makes sure it can be loaded
# everywhere, falling back to ever slower implementations.
#
# try:
#     from lxml import etree
#     logger.debug("running with lxml.etree")
# except ImportError:
try:
    # Python 2.5
    import xml.etree.cElementTree as etree
    logger.debug("running with cElementTree on Python 2.5+")
except ImportError:
    try:
        # Python 2.5
        import xml.etree.ElementTree as etree
        logger.debug("running with ElementTree on Python 2.5+")
    except ImportError:
        try:
            # normal cElementTree install
            import cElementTree as etree
            logger.debug("running with cElementTree")
        except ImportError:
            try:
                # normal ElementTree install
                import elementtree.ElementTree as etree
                logger.debug("running with ElementTree")
            except ImportError:
                logger.critical("Failed to import ElementTree from any "
                                "known place")
                sys.exit(1)

DCTERMS = '{http://purl.org/dc/terms/}'
DC = '{http://purl.org/dc/elements/1.1/}'
PUBLICATION = '{http://escidoc.mpg.de/metadataprofile/schema/0.1/publication}'
ESCIDOC = '{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}'
ESCIDOCCOMPONENTS = '{http://www.escidoc.de/schemas/components/0.7}'
ESCIDOCMETADATAPROFILE = '{http://escidoc.mpg.de/metadataprofile/schema/0.1/}'
XLINK = '{http://www.w3.org/1999/xlink}'
PROP = '{http://escidoc.de/core/01/properties/}'
XSI = '{http://www.w3.org/2001/XMLSchema-instance}'

# Get a file to put the source in; partly this is so we can
# look at that file afterwards, partly it is because on the server
# an elementtree package is used that does not know how to
# download stuff.
target_dir = os.environ.get('CLIENT_HOME', os.getcwd())
TARGET_FILE_NAME = os.path.join(target_dir, 'latest_pubman_export.xml')

# Location to use as default source of the xml.
#
# Note: this is ONLY used when the pubman.py script is called directly
# from the command line (so not within Plone) when no specific source
# is specified.  The ONLY time it is used from within Plone is during
# install/upgrade when setting a property in
# portal_properties/pubman_settings when that property does not exist
# yet.  If the property exists, the url here is NOT used in Plone.
#
# Note: this must NOT contain spaces (replace them by %20)
PUBMAN_URL = ('http://pubman.mpdl.mpg.de/search/SearchAndExport?'
              'cqlQuery=escidoc.content-model.objid=escidoc:persistent4%20'
              'AND%20escidoc.context.objid%20any%20"escidoc:54203%20escidoc:61348%20'
              'escidoc:57277"&exportFormat=APA&outputFormat=snippet&language=all&'
              'sortKeys=&sortOrder=ascending&startRecord=&maximumRecords=')

#__DOCSTART__ <= Comment for documentation generation.
# PUBLISHER_VERSION private components result in a 'request reprint' link.
PUBLISHER_VERSION = ('PUBLISHER_VERSION', 'publisher-version')
# FULLTEXT_CATEGORIES components are shown as the full text links.
FULLTEXT_CATEGORIES = PUBLISHER_VERSION + ('ANY_FULLTEXT', 'any-fulltext')
# PREPRINT, POSTPRINT components are shown as pre/post print links.
PREPRINT = ('PRE_PRINT', 'pre-print')
POSTPRINT = ('POST_PRINT', 'post-print')
# TOC_CATEGORIES components are shown as table of contents links.
TOC_CATEGORIES = ('TOC', 'TABLE_OF_CONTENTS', 'toc', 'table-of-contents')
#__DOCEND__ <= The stuff above is returned by ``pubman.py doc``.

# Sources starting with one of these are fetched with urlopen; anything else
# is treated as a path on the file system.
_URL_PREFIXES = ('file:', 'http:', 'https:')

# Matches the element that wraps the year or status in a formatted citation.
# NOTE(review): the markup in this pattern was lost in the copy of the file
# this was restored from.  The <span> tag and the "DisplayDateStatus" class
# are reconstructed from the surrounding comments and from what PubMan
# citation snippets are believed to look like -- confirm against real
# export data.
_STATUS_PATTERN = re.compile(r"""
    <span                      # Opening tag
    \s+                        # Whitespace
    class="DisplayDateStatus"  # The special class
    \s*>                       # Optional whitespace and closing >
    (?P<status>.*?)            # The actual status, non-greedy
    </span>                    # Closing tag
    """, re.VERBOSE)


def _utf8(text):
    """Return ``text`` as a native ``str``.

    On Python 2 a ``unicode`` value is encoded to a UTF-8 byte string, which
    is what the ``.encode('utf-8')`` calls this replaces produced.  Values
    that already are ``str`` (byte strings on Python 2, text on Python 3)
    and ``None`` are returned unchanged.
    """
    if text is None or isinstance(text, str):
        return text
    return text.encode('utf-8')


def split_subjects(text):
    """Return cleaned-up subjects from long string.

    ``text`` is split on both ';' and ','.  Every part is lowercased and
    stripped; empty parts are dropped.  The result is a list of strings.

    For backward compatibility an empty or missing ``text`` results in the
    empty string ``''`` instead of an empty list.
    """
    if not text:
        return ''
    parts = [part.lower().strip()
             for chunk in text.split(';')
             for part in chunk.split(',')]
    # Filter out empty ones
    return [_utf8(part) for part in parts if part]


def extract_groups(name):
    """Extract groups, if possible.

    Spit out all comma-separated individual values of ``name``, stripped of
    surrounding whitespace.  The result is a generator, not a list.
    """
    return (item.strip() for item in name.split(','))


def extract_year_or_status(citation):
    """Extract either the year or the status from the formatted citation.

    The year or status is enclosed by a <span> with a special class::

      >>> cit = 'Some <span class="DisplayDateStatus">(2008).</span> Thing'
      >>> cit2 = ('Some <span class="DisplayDateStatus">(In print).</span>'
      ...         ' Thing')
      >>> junk = ('Some <span class="DisplayDateStatus">( (2008). ).</span>'
      ...         ' Thing')
      >>> nothing = 'Something without a match.'
      >>> nested = ('Some <span><span class="DisplayDateStatus"> ( 2007 ). '
      ...           '</span></span>')

    When there's no match, we return 'Unknown'::

      >>> extract_year_or_status(nothing)
      'Unknown'
      >>> extract_year_or_status(None)
      'Unknown'

    Years and statuses are extracted when present::

      >>> extract_year_or_status(cit)
      '2008'
      >>> extract_year_or_status(cit2)
      'In print'

    And also when nested in another span::

      >>> extract_year_or_status(nested)
      '2007'

    And spaces, dots and parentheses get stripped::

      >>> extract_year_or_status(junk)
      '2008'

    """
    default = 'Unknown'
    if citation is None:
        return default
    match = _STATUS_PATTERN.search(citation)
    if not match:
        return default
    year_or_status = match.group('status').strip()
    logger.debug("Found year or status: %r", year_or_status)
    for unwanted in '().':
        year_or_status = year_or_status.replace(unwanted, '')
    # Strip last: removing the punctuation can expose whitespace that
    # used to be inside the parentheses.
    return year_or_status.strip()


def download_source(source):
    """Download (or copy) the source and put it in the TARGET_FILE_NAME

    source can be a file url, http(s) url, or a path on the file system.

    For ease of use by others it returns the path where the source has
    ended up, which will always be TARGET_FILE_NAME, unless something
    went wrong.  This can be handy if you want to parse the source
    once for validation and then a second time for importing it in
    e.g. a Plone website.

    An IOError raised while downloading is logged and re-raised; in that
    case no target file is left behind.
    """
    if source == 'file://' + TARGET_FILE_NAME:
        source = TARGET_FILE_NAME
    if source == TARGET_FILE_NAME:
        # We are reusing the old file as the new file: nothing to fetch.
        return TARGET_FILE_NAME
    if os.path.exists(TARGET_FILE_NAME):
        # Remove old file.
        os.unlink(TARGET_FILE_NAME)
    if source.startswith(_URL_PREFIXES):
        logger.debug("Downloading %s to %s", source, TARGET_FILE_NAME)
        try:
            response = urlopen(source)
            try:
                data = response.read()
            finally:
                response.close()
        except IOError:
            # log the error and reraise
            logger.error("url not found: %r.", source)
            raise
        # Only create the target once the download succeeded.  Binary
        # mode: the downloaded data is written exactly as received.
        target_file = open(TARGET_FILE_NAME, 'wb')
        try:
            target_file.write(data)
        finally:
            target_file.close()
    else:
        logger.debug("Copying %s to %s", source, TARGET_FILE_NAME)
        shutil.copy(source, TARGET_FILE_NAME)
    return TARGET_FILE_NAME


def parse(source):
    """Yield a dictionary for every item in the export found at ``source``.

    ``source`` is handed to ``download_source``; the resulting file is then
    parsed.  The keys of the yielded dictionaries are: id, modified,
    formatted_citation, year_or_status, title, keywords, authors, groups,
    publication_type, created, abstract, doi, uri, supplementary, fulltext,
    preprint, postprint, display_request_reprint, toc, toc_text and
    local_tags.  Optional values that are missing in the xml end up as
    ``None``, an empty list or an empty string.

    An AssertionError is raised when a child of the root element is not an
    ``item`` element.

    This function is deliberately kept in one piece: ``pubman.py doc``
    prints the double-hash comments in the order in which they occur in
    this file.
    """
    logger.debug("Parsing %s", source)
    target = download_source(source)
    if target is None:
        return
    tree = etree.parse(target)
    root = tree.getroot()
    logger.info("%s items found.", len(root))
    # Note: every comment line starting with '##' is extracted when running
    # `python pubman.py doc`.  It is used to document which fields or
    # attributes are extracted from the xml file.  So keep the double hashes
    # in place and keep the documentation up-to-date.

    ## Every item in the root is parsed. We check if it is a escidocItem:item.
    for item in root:
        res = {}
        # Assumption: it is a list of items.
        assert stripped_tag(item) == 'item', (
            "Unexpected element in export: %r" % (item.tag,))
        ## Attribute `objid` = interal id of the item.
        res['id'] = item.get('objid')
        logger.debug('===================================')
        logger.info("Extracting %s", res['id'])
        ## Attribute `last-modification-date` = last modified date.
        res['modified'] = item.get('last-modification-date')
        ## A `dcterms:bibliographicCitation` somewhere in the item is used as
        ## the formatted citation. Any `<br/>` tag is stripped out.
        citation = item.findtext('.//' + DCTERMS + 'bibliographicCitation')
        if citation:
            citation = _utf8(citation)
            # NOTE(review): the tag that is removed here was lost in the
            # copy of the file this was restored from; '<br/>' is a
            # reconstruction -- confirm the exact spelling.
            citation = citation.replace('<br/>', '')
        res['formatted_citation'] = citation
        ## From this formatted citation, the year (`2007`) or status (`in
        ## print`) is extracted.
        res['year_or_status'] = _utf8(
            extract_year_or_status(res['formatted_citation']))
        ## `dc:title` = title.
        title = item.findtext('.//' + DC + 'title')
        if title:
            title = _utf8(title)
        res['title'] = title
        ## `dc:subjects` is split on both comma and semicolon. The resulting
        ## words are used as keywords.
        keywords = item.findtext('.//' + DC + 'subject')
        res['keywords'] = split_subjects(keywords)

        authors = []
        groups = []
        ## Inside an item, every `publication:creator` is extracted.
        for creator in item.findall('.//' + PUBLICATION + 'creator'):
            ## * Every creator without an attribute role="author" or "editor" is ignored.
            if creator.get('role') not in ('author', 'editor'):
                logger.debug("Found creator without author/editor role: %s.",
                             creator.get('role'))
                continue
            ## * Within a creator, every `escidoc:person` is extracted.
            for author in creator.findall(ESCIDOC + 'person'):
                # Any of the name elements may be missing: default to ''
                # instead of failing on None.
                full = author.findtext(
                    './' + ESCIDOC + 'complete-name', '').strip()
                family = author.findtext(
                    './' + ESCIDOC + 'family-name', '').strip()
                given = author.findtext(
                    './' + ESCIDOC + 'given-name', '').strip()
                ## * The person's `complete-name` is used if available,
                ##   otherwise `family-name` and `given-name` are
                ##   concatenated.
                if not full:
                    full = u' '.join(
                        [part for part in (given, family) if part])
                authors.append(_utf8(full))
                ## * For every person, all `escidoc:organization`s are found
                ##   and their `escidoc:organization-name` extracted. These
                ##   organization names are addedto the list of groups
                ##   of the publication.
                for org in author.findall('./' + ESCIDOC + 'organization'):
                    name = org.findtext(
                        './' + ESCIDOC + 'organization-name')
                    if not name:
                        continue
                    for group in extract_groups(name):
                        group = _utf8(group)
                        if group not in groups:
                            groups.append(group)

        # The publication type is consulted when handling the components
        # below, so it always gets a value.
        res['publication_type'] = None
        ## On the item, the first `escidocmetadataprofile:publication` is
        ## extracted,
        pubdata = item.findall('.//' + ESCIDOCMETADATAPROFILE +
                               'publication')
        if pubdata:
            ## the publication's `type` attribute becomes the
            ## publication_type.
            publication_type = pubdata[0].get('type')
            # ^^^ Non-namespaced at the moment.
            if publication_type is not None:
                res['publication_type'] = _utf8(publication_type)
        res['authors'] = authors
        res['groups'] = groups
        ## `dcterms:created` becomes the creation date. If it contains just a
        ## year ('2003'), it gets converted to ('2003-01-01').
        created = item.findtext('.//' + DCTERMS + 'created')
        if created and len(created) == 4:
            # Just '2003'.
            created = created + '-01-01'
        res['created'] = created
        ## The abstract is extracted from `dcterms:abstract`.
        abstract = item.findtext('.//' + DCTERMS + 'abstract')
        if abstract:
            abstract = _utf8(abstract)
        res['abstract'] = abstract

        # Extract uri and doi links
        doi = None
        uri = None
        ## Every `dc:identifier` is extracted. If the `xsi:type` attribute
        ## matches `eidt:DOI` or `dcterms:URI`, it gets used as DOI or URI
        ## link respectively.
        for identifier in item.findall('.//' + DC + 'identifier'):
            value = identifier.text
            if not value:
                # An empty identifier element carries no link.
                continue
            type_ = identifier.get(XSI + 'type')
            if type_ == 'eidt:DOI':
                doi = 'http://dx.doi.org/' + _utf8(value)
            elif type_ == 'dcterms:URI':
                uri = _utf8(value)
        res['doi'] = doi
        res['uri'] = uri

        components = {}
        fulltext = []
        preprint = None
        postprint = None
        display_request_reprint = False
        toc = None
        toc_text = None
        known_categories = (FULLTEXT_CATEGORIES + PREPRINT + POSTPRINT +
                            TOC_CATEGORIES)
        ## Every `escidocmetadataprofile:components` is extracted:
        for component in item.findall(
                './/' + ESCIDOCCOMPONENTS + 'component'):
            com = {}
            contentitem = component.find(ESCIDOCCOMPONENTS + 'content')
            if contentitem is None:
                # Without content there is nothing to link to.
                logger.warning("Component without content: ignoring it.")
                continue
            ## * `escidoccomponents:content`'s `storage` attribute is
            ##   checked. If the value is `external-url`, storage is
            ##   external. Otherwise it is internal.
            storage = contentitem.get('storage')
            com['external'] = (storage == 'external-url')
            if storage not in ('external-url', 'internal-managed'):
                logger.warning("Unkown storage attribute: %s.", storage)
            ## * `escidoccomponents:content`'s `xlink:href` attribute is
            ##   extracted for use as an url, but this is currently unused
            ##   (the doi/uri links have taken over the role).
            href = contentitem.get(XLINK + 'href')
            if href is None:
                logger.warning("Component content without xlink:href.")
            else:
                if href.startswith('null'):
                    logger.warning(
                        "href starts with 'null', converting it: %s.", href)
                    href = href[len('null'):]
                if not href.startswith('http'):
                    logger.warning(
                        "href does not start with 'http': %s.", href)
                    # Replacing by request from Nicole Kondic. [reinout]
                    href = 'http://coreservice.mpdl.mpg.de:8080' + href
            com['href'] = href
            ## * `prop:content-category` is extracted, as category, to find
            ##   fulltext/preprint/postprint, and supplementary items.
            category = component.findtext('.//' + PROP + 'content-category')
            if category == 'SUPPLEMENTARY_MATERIAL':
                category = 'supplementary-material'
            com['category'] = category
            logger.debug("Component with category %r found.", category)
            if category not in known_categories:
                logger.debug("Non-full/pre/post/toc category found: %s",
                             category)
                components.setdefault(category, [])
            ## * `prop:visibility` is checked. Components without a visibility
            ##   of `public` should be ignored, except for publisher-version
            ##   components inside an article or book-item.
            visibility = component.findtext('.//' + PROP + 'visibility')
            com['private'] = False
            if visibility == 'public':
                logger.debug("Component is public.")
            else:
                com['private'] = True
                if category not in PUBLISHER_VERSION:
                    logger.debug("Private component which isn't a "
                                 "publisher-version: ignoring it.")
                    continue  # Skips to the next component.
                if res['publication_type'] not in ('article', 'book-item'):
                    # Reinout says that publications that aren't articles or
                    # book-items should be withheld when private.
                    logger.debug("Private fulltext which isn't in an article "
                                 "or book-item: ignoring it.")
                    continue  # Skips to the next component.
                display_request_reprint = True
                # Skip to the next component.  We don't need the actual
                # fulltext anymore, just the fact that we need to display
                # the request reprint link.
                logger.debug("Private AND publisher-version AND "
                             "article/bookitem: showing 'request reprint'.")
                continue
            ## * `dc:title` is used as the component's title.
            com['title'] = component.findtext('.//' + DC + 'title')
            ## * `dc:description` is used at the component's description.
            com['description'] = component.findtext(
                './/' + DC + 'description')
            # Commented out mimetype as it isn't used.
            #com['mimetype'] = component.findtext('.//' + PROP + 'mime-type')

            # Finally, add the component
            ## * If the category is any-fulltext or publisher-version, it is
            ##   used as the fulltext link. If there is more than one
            ##   fulltext, all are used. pre-print and post-print are extracted
            ##   as pre/postprint. Only one each of those is allowed.
            ##   A table-of-contents is extracted as toc.
            if category in FULLTEXT_CATEGORIES:
                fulltext.append(com)
                logger.debug("Found fulltext component")
            elif category in PREPRINT:
                if preprint:
                    logger.warning(
                        "More than one preprint link (%s), ignoring.",
                        category)
                else:
                    logger.debug("Found preprint component")
                    preprint = com
            elif category in POSTPRINT:
                if postprint:
                    logger.warning(
                        "More than one postprint link (%s), ignoring.",
                        category)
                else:
                    logger.debug("Found postprint component")
                    postprint = com
            elif category in TOC_CATEGORIES:
                if toc:
                    logger.warning("More than one toc link, ignoring.")
                else:
                    logger.debug("Found TOC component")
                    toc = com
            else:
                components[category].append(com)
        # res['components'] = components
        # ^^^ old: several categories.
        # New: just supplementary material.
        ## * If the category is supplementary material, they are stored as
        ##   supplementary material. More than one such component can exist.
        res['supplementary'] = components.get('supplementary-material', [])
        for key in components:
            if key != 'supplementary-material':
                logger.warning("Unused component category: %s", key)
        res['fulltext'] = fulltext
        res['preprint'] = preprint
        res['postprint'] = postprint
        res['display_request_reprint'] = display_request_reprint
        if not toc:
            ## A `dcterms:TableOfContents` somewhere in the item is used
            ## if no table of contents component has been found earlier.
            toc_text = item.findtext('.//' + DCTERMS + 'tableOfContents')
            if toc_text:
                logger.debug("Found dcterms toc")
        res['toc'] = toc
        res['toc_text'] = toc_text or ''
        ## A section local-tags with zero or more local-tag tags in it
        ## is extracted.
        local_tags = []
        for taglist in item.findall('.//local-tags'):
            # Should be only one, or possibly zero, but it is easiest
            # to just iterate over all of them.
            for tag in taglist.findall('./local-tag'):
                # Read the element's own text directly: the meaning of the
                # path './' differs between ElementTree versions.
                text = (tag.text or '').strip()
                if text:
                    local_tags.append(text)
        res['local_tags'] = local_tags
        yield res


def stripped_tag(item):
    """Return tagname without namespace.

    ``'{namespace}name'`` results in ``'name'``; a tag without a namespace
    is returned unchanged.
    """
    return item.tag.split('}')[-1]


def print_documentation():
    """Print the parse documentation that is embedded in this file.

    First the text of every double-hash comment line is printed, with an
    empty line after every line that ends with a dot.  Then the section
    between the DOCSTART and DOCEND marker lines is printed: lines starting
    with '# ' as separate paragraphs, all other lines indented.
    """
    path = __file__
    if path.endswith(('.pyc', '.pyo')):
        # Read the source, not the byte-compiled file.
        path = path[:-1]
    source_file = open(path)
    try:
        raw_lines = source_file.read().splitlines()
    finally:
        source_file.close()
    write = sys.stdout.write

    for line in raw_lines:
        line = line.strip()
        if not line.startswith('##'):
            continue
        line = line[3:]
        write(line + '\n')
        if line.endswith('.'):
            write('\n')
    write('\n')

    special = False
    for line in raw_lines:
        if line.startswith('#__DOCSTART__'):
            special = True
            continue
        if line.startswith('#__DOCEND__'):
            special = False
            continue
        if not special:
            continue
        if line.startswith('# '):
            write('\n' + line[2:] + '\n\n')
        else:
            write('  ' + line + '\n')


def main():
    """Command line entry point.

    The first command line argument is used as the source; without one
    ``PUBMAN_URL`` is used.  The special argument ``doc`` prints the parse
    documentation instead of parsing anything.
    """
    if len(sys.argv) > 1:
        source = sys.argv[1]
        if source == 'doc':
            # Special: generate parse documentation from this very file.
            print_documentation()
            return
        logger.info("Using %s as source instead of default %s",
                    source, PUBMAN_URL)
    else:
        source = PUBMAN_URL
        logger.debug("No source specified, falling back to %s", source)
    result = list(parse(source))
    pprint.pprint(result)


if __name__ == '__main__':
    main()