import logging
import os
import re
import pprint
import shutil
import sys
import urllib

# Module-wide logger: every message in this file goes through 'pubman'.
logger = logging.getLogger('pubman')
# NOTE(review): basicConfig() is called at import time, so merely importing
# this module asks for DEBUG-level logging on the root logger.  The INFO
# variant is kept here for when less verbose output is wanted.
#logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.DEBUG)

"""
The etree importing try/except handling was commented out by Maurits
on 11 March 2009 after Mark got a test failure which was very likely
due to having a newer lxml on his machine.  The server at machine
currently does not have lxml installed so it uses another (slightly
slower) standard implementation.  We could put lxml in the buildout,
but that has given us problems in the past, at least on Apple Macs.
So we just pick one import from the tree below.

Note that speed is not really important for us as this is only used in
a nightly cron job.

Actually, since elementtree is not always available, and other users
(at pubman) may use python2.5, we simply do not try to import lxml.


# etree import routine from the website. This makes sure it can be loaded
# everywhere, falling back to ever slower implementations.
# try:
#     from lxml import etree
#     logger.debug("running with lxml.etree")
# except ImportError:
"""
# Bind ``etree`` to the first ElementTree implementation that can be
# imported.  The candidates are tried in the order below; lxml is not tried.
try:
    # Python 2.5+: cElementTree from the standard library.
    import xml.etree.cElementTree as etree
    logger.debug("running with cElementTree on Python 2.5+")
except ImportError:
    try:
        # Python 2.5+: plain ElementTree from the standard library.
        import xml.etree.ElementTree as etree
        logger.debug("running with ElementTree on Python 2.5+")
    except ImportError:
        try:
            # Separately installed cElementTree package.
            import cElementTree as etree
            logger.debug("running with cElementTree")
        except ImportError:
            try:
                # Separately installed elementtree package.
                import elementtree.ElementTree as etree
                logger.debug("running with ElementTree")
            except ImportError:
                # Nothing usable was found.  The parser below cannot work
                # without ``etree``, so log it and exit with status 1.
                logger.critical("Failed to import ElementTree from any "
                                "known place")
                sys.exit(1)


# XML namespaces of the export, written as ``{namespace-uri}`` so they can be
# put directly in front of a tag or attribute name when searching the tree
# (for example ``DC + 'title'``).
DCTERMS = '{http://purl.org/dc/terms/}'
DC = '{http://purl.org/dc/elements/1.1/}'
PUBLICATION = '{http://escidoc.mpg.de/metadataprofile/schema/0.1/publication}'
ESCIDOC = '{http://escidoc.mpg.de/metadataprofile/schema/0.1/types}'
ESCIDOCCOMPONENTS = '{http://www.escidoc.de/schemas/components/0.7}'
ESCIDOCMETADATAPROFILE = '{http://escidoc.mpg.de/metadataprofile/schema/0.1/}'
XLINK = '{http://www.w3.org/1999/xlink}'
PROP = '{http://escidoc.de/core/01/properties/}'
XSI = '{http://www.w3.org/2001/XMLSchema-instance}'

# Get a file to put the source in; partly this is so we can
# look at that file afterwards, partly it is because on the server
# an elementtree package is used that does not know how to
# download stuff.
#
# The file is placed in the directory named by the CLIENT_HOME environment
# variable, or in the current working directory when that variable is not
# set.  Both are evaluated once, when this module is imported.
target_dir = os.environ.get('CLIENT_HOME', os.getcwd())
TARGET_FILE_NAME = os.path.join(target_dir, 'latest_pubman_export.xml')

# Location to use as default source of the xml.
#
# Note: this is ONLY used when the pubman.py script is called directly
# from the command line (so not within Plone) when no specific source
# is specified.  The ONLY time it is used from within Plone is during
# install/upgrade when setting a property in
# portal_properties/pubman_settings when that property does not exist
# yet.  If the property exists, the url here is NOT used in Plone.
#
# Note: this must NOT contain spaces (replace them by %20)
#
# The url is built from adjacent string literals, so it is one single string
# without line breaks.  It carries a cqlQuery on content model and context
# ids, and asks for exportFormat=APA with outputFormat=snippet.
PUBMAN_URL = ('http://pubman.mpdl.mpg.de/search/SearchAndExport?'
        'cqlQuery=escidoc.content-model.objid=escidoc:persistent4%20'
        'AND%20escidoc.context.objid%20any%20"escidoc:54203%20escidoc:61348%20'
        'escidoc:57277"&exportFormat=APA&outputFormat=snippet&language=all&'
        'sortKeys=&sortOrder=ascending&startRecord=&maximumRecords=')

# Component categories, each as a tuple of the spellings that are accepted.
# Careful when editing: everything between the two marker comments below is
# read back from this file and printed by print_documentation(), comments
# included.
#__DOCSTART__ <= Comment for documentation generation.
# PUBLISHER_VERSION private components result in a 'request reprint' link.
PUBLISHER_VERSION = ('PUBLISHER_VERSION', 'publisher-version')
# FULLTEXT_CATEGORIES components are shown as the full text links.
FULLTEXT_CATEGORIES = PUBLISHER_VERSION + ('ANY_FULLTEXT', 'any-fulltext')
# PREPRINT, POSTPRINT components are shown as pre/post print links.
PREPRINT = ('PRE_PRINT', 'pre-print')
POSTPRINT = ('POST_PRINT', 'post-print')
# TOC_CATEGORIES components are shown as table of contents links.
TOC_CATEGORIES = ('TOC', 'TABLE_OF_CONTENTS',
                  'toc', 'table-of-contents')
#__DOCEND__ <= The stuff above is returned by ``pubman.py doc``.


def split_subjects(text):
    """Return the cleaned-up subjects found in one long string.

    ``text`` is split on both semicolons and commas.  Every piece is
    lower-cased, stripped of surrounding whitespace and encoded as UTF-8;
    pieces that end up empty are dropped.

    The result is always a list.  A missing or empty ``text`` (``None`` or
    ``''``) gives an empty list, so callers get the same type in every
    case.
    """
    if not text:
        return []
    subjects = []
    # Turning the semicolons into commas first lets a single split handle
    # both separators.
    for piece in text.replace(';', ',').split(','):
        piece = piece.lower().strip()
        if piece:
            subjects.append(piece.encode('utf-8'))
    return subjects


def extract_groups(name):
    """Split a comma-separated string into its individual values.

    Each value is handed out with the surrounding whitespace removed.
    Nothing is filtered, so empty values are passed through as ``''``.
    The result is a lazy generator, not a list.
    """
    pieces = name.split(',')
    return (piece.strip() for piece in pieces)


# Compiled once at import time instead of on every call.  Matches the <span>
# that wraps the year or status inside a formatted citation.
_DISPLAY_DATE_STATUS = re.compile(r"""
    <span\W+                   # Span tag plus whitespace
    class="DisplayDateStatus"  # The class we're looking for
    \W*>                       # Optional whitespace and closing >
    (?P<status>.*?)            # The actual status, non-greedy
    </span>                    # Closing tag
    """, re.VERBOSE)


def extract_year_or_status(citation):
    """Extract either the year or the status from the formatted citation.

    The year or status is enclosed by a <span> with a special class::

      >>> cit = 'Some (<span class="DisplayDateStatus">2008</span>). Thing'
      >>> cit2 = 'Some (<span class="DisplayDateStatus">In print</span>). Thing'
      >>> junk = 'Some ( <span class="DisplayDateStatus"> (2008). </span> ). Thing'
      >>> spaced = 'Some <span class="DisplayDateStatus">( 2008 ).</span> Thing'
      >>> nothing = 'Something without <span class="hurray">a</span> match.'
      >>> nested = '<span>Some (<span class="DisplayDateStatus"> 2007 </span>). </span>'

    When there's no match, we return 'Unknown'::

      >>> extract_year_or_status(nothing)
      'Unknown'
      >>> extract_year_or_status(None)
      'Unknown'

    Years and statuses are extracted when present::

      >>> extract_year_or_status(cit)
      '2008'
      >>> extract_year_or_status(cit2)
      'In print'

    And also when nested in another span::

      >>> extract_year_or_status(nested)
      '2007'

    And spaces, dots and parentheses get stripped::

      >>> extract_year_or_status(junk)
      '2008'

    Also when the spaces sit inside the parentheses::

      >>> extract_year_or_status(spaced)
      '2008'

    Only the first matching span is used.  Dots and parentheses are
    removed wherever they occur in the matched text, not just at the
    ends.
    """
    default = 'Unknown'
    if citation is None:
        return default
    match = _DISPLAY_DATE_STATUS.search(citation)
    if match is None:
        return default
    year_or_status = match.group('status').strip()
    logger.debug("Found year or status: %r", year_or_status)
    for unwanted in ('(', ')', '.'):
        year_or_status = year_or_status.replace(unwanted, '')
    # Strip once more: removing the punctuation can uncover whitespace that
    # used to sit inside the parentheses, as in '( 2008 ).'.
    return year_or_status.strip()


def download_source(source):
    """Download (or copy) the source and put it in TARGET_FILE_NAME.

    ``source`` can be a file url, an http(s) url, or a path on the file
    system.  Anything whose first four characters are ``file`` or
    ``http`` is fetched with ``urllib.urlopen``; everything else is
    copied with ``shutil.copy``.

    When ``source`` already is TARGET_FILE_NAME (as a plain path or as a
    ``file://`` url) the existing file is reused and left untouched.  In
    every other case an existing TARGET_FILE_NAME is removed first.

    A download is read completely before the target file is opened, so
    a failed download does not leave an empty or half-written target
    file behind.

    For ease of use by others it returns the path where the source has
    ended up, which is always TARGET_FILE_NAME.  This can be handy if
    you want to parse the source once for validation and then a second
    time for importing it in e.g. a Plone website.

    Errors are not swallowed: an IOError while downloading is logged and
    re-raised, and errors while copying or writing propagate unchanged.
    """
    # The target file may be handed in as its own file:// url.
    if source == 'file://' + TARGET_FILE_NAME:
        source = TARGET_FILE_NAME
    if source == TARGET_FILE_NAME:
        # We are reusing the old file as the new file: nothing to fetch.
        return TARGET_FILE_NAME

    if os.path.exists(TARGET_FILE_NAME):
        # Remove old file.
        os.unlink(TARGET_FILE_NAME)

    if source[:4] in ('file', 'http'):
        logger.debug("Downloading %s to %s", source, TARGET_FILE_NAME)
        try:
            response = urllib.urlopen(source)
            try:
                content = response.read()
            finally:
                response.close()
        except IOError:
            # log the error and reraise
            logger.error("url not found: %r.", source)
            raise
        # Binary mode: store the downloaded bytes exactly as received.
        target_file = open(TARGET_FILE_NAME, 'wb')
        try:
            target_file.write(content)
        finally:
            target_file.close()
    else:
        logger.debug("Copying %s to %s", source, TARGET_FILE_NAME)
        shutil.copy(source, TARGET_FILE_NAME)
    return TARGET_FILE_NAME


def parse(source):
    """Yield one dictionary per publication item found in ``source``.

    ``source`` is first handed to ``download_source()``, so it can be a
    url or a path; the resulting local file is parsed with ElementTree.
    This is a generator: nothing is downloaded or parsed until the
    result is iterated over.

    Every yielded dictionary has the keys ``id``, ``modified``,
    ``formatted_citation``, ``year_or_status``, ``title``, ``keywords``,
    ``authors``, ``groups``, ``created``, ``abstract``, ``doi``, ``uri``,
    ``supplementary``, ``fulltext``, ``preprint``, ``postprint``,
    ``display_request_reprint``, ``toc``, ``toc_text`` and
    ``local_tags``.  The key ``publication_type`` is only present when
    the item has a publication element with a ``type`` attribute.

    Raises AssertionError when a child of the root element is not an
    ``item``.  Errors from downloading or from the xml parser propagate
    unchanged.
    """
    logger.debug("Parsing %s", source)

    target = download_source(source)
    if target is None:
        return

    tree = etree.parse(target)
    root = tree.getroot()
    logger.info("%s items found.", len(root))
    # Note: every comment line starting with '##' is extracted when running
    # `python pubman.py doc`. It is used to document which fields or
    # attributes are extracted from the xml file. So keep the double hashes in
    # place and keep the documentation up-to-date.

    ## Every item in the root is parsed. We check if it is a escidocItem:item.
    for item in root:
        res = {}
        # Assumption: it is a list of items.
        assert stripped_tag(item) == 'item'
        ## Attribute `objid` = interal id of the item.
        res['id'] = item.get('objid')
        logger.debug('===================================')
        logger.info("Extracting %s", res['id'])
        ## Attribute `last-modification-date` = last modified date.
        res['modified'] = item.get('last-modification-date')

        ## A `dcterms:bibliographicCitation` somewhere in the item is used as
        ## the formatted citation. Any `<br/>` tag is stripped out.
        citation = item.findtext('.//' + DCTERMS + 'bibliographicCitation')
        if citation:
            citation = citation.encode('utf-8')
            citation = citation.replace('<br/>', '')
        res['formatted_citation'] = citation
        ## From this formatted citation, the year (`2007`) or status (`in
        ## print`) is extracted.
        res['year_or_status'] = extract_year_or_status(
            res['formatted_citation']).encode('utf-8')

        ## `dc:title` = title.
        title = item.findtext('.//' + DC + 'title')
        if title:
            title = title.encode('utf-8')
        res['title'] = title
        ## `dc:subjects` is split on both comma and semicolon. The resulting
        ## words are used as keywords.
        keywords = item.findtext(
            './/' + DC + 'subject')
        res['keywords'] = split_subjects(keywords)

        authors = []
        groups = []
        ## Inside an item, every `publication:creator` is extracted.
        for creator in item.findall('.//' + PUBLICATION + 'creator'):
            ## * Every creator without an attribute role="author" or "editor" is ignored.
            if creator.get('role') not in ('author', 'editor'):
                logger.debug("Found creator without author/editor role: %s.",
                             creator.get('role'))
                continue
            ## * Within a creator, every `escidoc:person` is extracted.
            for author in creator.findall(ESCIDOC + 'person'):
                # findtext() returns None when the element is missing, so
                # fall back to '' before stripping.
                full = (author.findtext(
                    './' + ESCIDOC + 'complete-name') or '').strip()
                family = (author.findtext(
                    './' + ESCIDOC + 'family-name') or '').strip()
                given = (author.findtext(
                    './' + ESCIDOC + 'given-name') or '').strip()
                ## * The person's `complete-name` is used if available,
                ##   otherwise `family-name` and `given-name` are
                ##   concatenated.
                if not full:
                    # Only join the parts that are there, to avoid a stray
                    # leading or trailing space.
                    full = u' '.join(
                        [part for part in (given, family) if part])
                if full:
                    authors.append(full.encode('utf-8'))
                else:
                    logger.debug("Found person without any name, "
                                 "not adding it to the authors.")
                ## * For every person, all `escidoc:organization`s are found
                ##   and their `escidoc:organization-name` extracted. These
                ##   organization names are addedto the list of groups
                ##   of the publication.
                for org in author.findall(
                    './' + ESCIDOC + 'organization'):
                    name = org.findtext(
                        './' + ESCIDOC + 'organization-name')
                    if name:
                        for group in extract_groups(name):
                            group = group.encode('utf-8')
                            if group not in groups:
                                groups.append(group)
        ## On the item, the first `escidocmetadataprofile:publication` is
        ## extracted,
        # Kept in a local as well, because the component handling below
        # needs it even when it could not be determined.
        publication_type = None
        pubdata = item.findall('.//' + ESCIDOCMETADATAPROFILE + 'publication')
        if pubdata:
            pubdata = pubdata[0]
            ## the publication's `type` attribute  becomes the
            ## publication_type.
            publication_type = pubdata.get('type')
            # ^^^ Non-namespaced at the moment.
            if publication_type is not None:
                publication_type = publication_type.encode('utf-8')
                res['publication_type'] = publication_type
        res['authors'] = authors
        res['groups'] = groups
        ## `dcterms:created` becomes the creation date. If it contains just a
        ## year ('2003'), it gets converted to ('2003-01-01').
        created = item.findtext('.//' + DCTERMS + 'created')
        if created:
            if len(created) == 4:
                # Just '2003'.
                created = created + '-01-01'
        res['created'] = created

        ## The abstract is extracted from `dcterms:abstract`.
        abstract = item.findtext('.//' + DCTERMS + 'abstract')
        if abstract:
            abstract = abstract.encode('utf-8')
        res['abstract'] = abstract

        # Extract uri and doi links
        doi = None
        uri = None
        ## Every `dc:identifier` is extracted. If the `xsi:type` attribute
        ## matches `eidt:DOI` or `dcterms:URI`, it gets used as DOI or URI
        ## link respectively.
        for identifier in item.findall('.//' + DC + 'identifier'):
            type_ = identifier.get(XSI + 'type')
            if type_ not in ('eidt:DOI', 'dcterms:URI'):
                continue
            if not identifier.text:
                # An empty identifier element has no text to use.
                logger.debug("Ignoring empty %s identifier.", type_)
                continue
            value = identifier.text.encode('utf-8')
            if type_ == 'eidt:DOI':
                doi = 'http://dx.doi.org/' + value
            else:
                uri = value
        res['doi'] = doi
        res['uri'] = uri

        components = {}
        fulltext = []
        preprint = None
        postprint = None
        display_request_reprint = False
        toc = None
        toc_text = None

        ## Every `escidocmetadataprofile:components` is extracted:
        for component in item.findall(
            './/' + ESCIDOCCOMPONENTS + 'component'):
            com = {}
            contentitem = component.find(ESCIDOCCOMPONENTS + 'content')
            ## * `escidoccomponents:content`'s `storage` attribute is
            ##   checked. If the value is `external-url`, storage is
            ##   external. Otherwise it is internal.
            storage = contentitem.get('storage')
            if storage == 'external-url':
                com['external'] = True
            elif storage == 'internal-managed':
                com['external'] = False
            else:
                com['external'] = False
                logger.warning("Unkown storage attribute: %s.", storage)
            ## * `escidoccomponents:content`'s `xlink:href` attribute is
            ##   extracted for use as an url, but this is currently unused
            ##   (the doi/uri links have taken over the role).
            href = contentitem.get(XLINK + 'href')
            if href.startswith('null'):
                logger.warning("href starts with 'null', converting it: %s.",
                               href)
                href = href[len('null'):]
            if not href.startswith('http'):
                logger.warning("href does not start with 'http': %s.", href)
                # Replacing by request from Nicole Kondic. [reinout]
                href = 'http://coreservice.mpdl.mpg.de:8080' + href
            com['href'] = href

            ## * `prop:content-category` is extracted, as category, to find
            ##   fulltext/preprint/postprint, and supplementary items.
            category = component.findtext('.//' + PROP + 'content-category')
            if category == 'SUPPLEMENTARY_MATERIAL':
                category = 'supplementary-material'
            com['category'] = category
            logger.debug("Component with category %r found.", category)
            if category not in (FULLTEXT_CATEGORIES + PREPRINT + POSTPRINT +
                                TOC_CATEGORIES):
                logger.debug("Non-full/pre/post/toc category found: %s",
                             category)
                if components.get(category, None) is None:
                    components[category] = []

            ## * `prop:visibility` is checked. Components without a visibility
            ##   of `public` should be ignored, except for publisher-version
            ##   components inside an article or book-item.
            visibility = component.findtext('.//' + PROP + 'visibility')
            com['private'] = False
            if visibility == 'public':
                logger.debug("Component is public.")
            else:
                com['private'] = True
                if category not in PUBLISHER_VERSION:
                    logger.debug("Private component which isn't a "
                                 "publisher-version: ignoring it.")
                    continue  # On to the next component.
                if publication_type not in ('article', 'book-item'):
                    # Reinout says that publications that aren't articles or
                    # book-items should be withheld when private.
                    logger.debug("Private fulltext which isn't in an article "
                                 "or book-item: ignoring it.")
                    continue  # On to the next component.
                display_request_reprint = True
                # On to the next component. We don't need the actual
                # fulltext anymore, just the fact that we need to display
                # the request reprint link.
                logger.debug("Private AND publisher-version AND "
                             "article/bookitem: showing 'request reprint'.")
                continue

            ## * `dc:title` is used as the component's title.
            com['title'] = component.findtext('.//' + DC + 'title')
            ## * `dc:description` is used at the component's description.
            com['description'] = component.findtext('.//' + DC + 'description')

            # Commented out mimetype as it isn't used.
            #com['mimetype'] = component.findtext('.//' + PROP + 'mime-type')

            # Finally, add the component
            ## * If the category is any-fulltext or publisher-version, it is
            ##   used as the fulltext link. If there is more than one
            ##   fulltext, all are used. pre-print and post-print are extracted
            ##   as pre/postprint. Only one each of those is allowed.
            ##   A table-of-contents is extracted as toc.
            if category in FULLTEXT_CATEGORIES:
                fulltext.append(com)
                logger.debug("Found fulltext component")
            elif category in PREPRINT:
                if preprint:
                    logger.warning(
                        "More than one preprint link (%s), ignoring.",
                        category)
                else:
                    logger.debug("Found preprint component")
                    preprint = com
            elif category in POSTPRINT:
                if postprint:
                    logger.warning(
                        "More than one postprint link (%s), ignoring.",
                        category)
                else:
                    logger.debug("Found postprint component")
                    postprint = com
            elif category in TOC_CATEGORIES:
                if toc:
                    logger.warning("More than one toc link, ignoring.")
                else:
                    logger.debug("Found TOC component")
                    toc = com
            else:
                # The list for this category was created above, when the
                # category turned out not to be one of the known ones.
                components[category].append(com)

        # res['components'] = components
        # ^^^ old: several categories.
        # New: just supplementary material.
        ## * If the category is supplementary material, they are stored as
        ##   supplementary material. More than one such component can exist.
        res['supplementary'] = components.get('supplementary-material', [])
        for key in components.keys():
            if key != 'supplementary-material':
                logger.warning("Unused component category: %s", key)
        res['fulltext'] = fulltext
        res['preprint'] = preprint
        res['postprint'] = postprint
        res['display_request_reprint'] = display_request_reprint
        if not toc:
            ## A `dcterms:TableOfContents` somewhere in the item is used
            ## if no table of contents component has been found earlier.
            toc_text = item.findtext('.//' + DCTERMS + 'tableOfContents')
            if toc_text:
                logger.debug("Found dcterms toc")
        res['toc'] = toc
        res['toc_text'] = toc_text or ''

        ## A section local-tags with zero or more local-tag tags in it
        ## is extracted.
        local_tags = []
        for taglist in item.findall('.//local-tags'):
            # Should be only one, or possibly zero, but it is easiest
            # to just iterate over all of them.
            for tag in taglist.findall('./local-tag'):
                # Read the element's own text directly.  The path './' is
                # treated as './*' (the first child) by newer ElementTree
                # versions, which gives None for a plain text tag.
                text = (tag.text or '').strip()
                if text:
                    local_tags.append(text)
        res['local_tags'] = local_tags


        yield res


def stripped_tag(item):
    """Return the tag name of ``item`` without its namespace.

    ElementTree writes a namespaced tag as ``{namespace-uri}name``; only
    the ``name`` part is returned.  A tag without a namespace is returned
    unchanged instead of raising an IndexError.
    """
    return item.tag.rsplit('}', 1)[-1]


def print_documentation():
    """Print the parse documentation that is embedded in this very file.

    Two things are printed, in this order:

    1. The text of every comment line that starts with a double hash,
       without its first three characters.  A blank line follows every
       line that ends with a full stop, and one more blank line closes
       this part.

    2. The lines between the DOCSTART and DOCEND marker comments.  A
       line starting with a hash and a space is printed without those
       two characters and surrounded by blank lines; every other line
       is printed indented.

    The file is read once and closed again before anything is printed.
    """
    path = __file__
    if path.endswith(('.pyc', '.pyo')):
        # When imported from compiled bytecode, read the matching source
        # file rather than the binary one.
        path = path[:-1]
    source_file = open(path)
    try:
        source_lines = source_file.read().splitlines()
    finally:
        source_file.close()

    # Note: only single-argument print calls are used here, as those behave
    # the same whether print is a statement or a function.
    for line in source_lines:
        line = line.strip()
        if not line.startswith('##'):
            continue
        line = line[3:]
        print(line)
        if line.endswith('.'):
            print('')
    print('')

    inside_section = False
    for line in source_lines:
        if line.startswith('#__DOCSTART__'):
            inside_section = True
            continue
        if line.startswith('#__DOCEND__'):
            inside_section = False
            continue
        if not inside_section:
            continue
        if line.startswith('# '):
            print('')
            print(line[2:])
            print('')
        else:
            print('  ' + line)


def main():
    """Command line entry point.

    The first command line argument, when given, is used as the source.
    The special value ``doc`` prints the parse documentation instead of
    parsing anything.  Without arguments PUBMAN_URL is used.  The parsed
    items are pretty-printed.
    """
    arguments = sys.argv[1:]
    if not arguments:
        source = PUBMAN_URL
        logger.debug("No source specified, falling back to %s", source)
    else:
        source = arguments[0]
        if source == 'doc':
            # Special: generate parse documentation from this very file.
            print_documentation()
            return
        logger.info("Using %s as source instead of default %s",
                    source, PUBMAN_URL)
    items = list(parse(source))
    pprint.pprint(items)


if __name__ == '__main__':
    main()