--- /home/jtg/vufind_remote/releases/VuFind-0.7/solr/conf/schema.xml 2008-01-12 11:55:14.000000000 -0600 +++ /home/jtg/vufind/solr/conf/schema.xml 2008-03-04 20:52:01.000000000 -0600 @@ -59,6 +59,7 @@ + --- /home/jtg/vufind_remote/releases/VuFind-0.7/import/marcxml2solr.xsl 2008-01-12 11:55:15.000000000 -0600 +++ /home/jtg/vufind/import/marcxml2solr.xsl 2008-03-04 20:59:48.000000000 -0600 @@ -78,6 +78,16 @@ + + + + + + + + + + --- /home/jtg/vufind_remote/releases/VuFind-0.7/web/services/Record/xsl/record-html.xsl 2008-01-12 11:55:16.000000000 -0600 +++ /home/jtg/vufind/web/services/Record/xsl/record-html.xsl 2008-03-11 20:02:10.000000000 -0500 @@ -83,12 +83,21 @@ : - /Author/Home?author= - - + + + /Author/Home?authornaf= + + + + + &author= + + + + - + @@ -401,4 +410,4 @@ - \ No newline at end of file + --- /home/jtg/vufind_remote/releases/VuFind-0.7/web/services/Author/Home.php 2008-01-12 11:55:15.000000000 -0600 +++ /home/jtg/vufind/web/services/Author/Home.php 2008-03-04 21:57:27.000000000 -0600 @@ -26,6 +26,67 @@ class Home extends Action { private $db; + /* + Return: array consisting of word => number of times the word appears + Input: $string - the string to analyze + $words - an array consisting of word => number of times the word appears + if there are existing values, they will be added to. + This means you can pass in a series of strings and + get the overall totals + + */ + function countWords($string,$words) { + foreach(explode(' ',$string) as $word) { + //print("$word
"); + $word = strtolower($word); + $word = str_replace(array(':', + ';', + ',', + "\'", + '"', + '(', + ')', + '|', + '/', + '?', + '!', + '@', + '#', + '$', + '%', + '^', + '&', + '*', + "\\", + '.', + '+', + '=', + '_', + '~', + '`', + '"'), + '', + $word); + /* '-' is not removed because hyphens might be important; we could probably just focus on the beginning and end of strings */ + /* simple stop list */ + if($word != '' && + $word != 'the' && + $word != 'a' && + $word != 's' && + $word != 'of' && + $word != 'on' && + $word != 'in' && + $word != 'an' && + $word != 'if' && + $word != 'to' && + $word != 'and') { + $words[$word]++; + } + } + return($words); + } + + function launch() { @@ -49,46 +110,160 @@ } if (!$interface->is_cached('layout.tpl|Author' . $_GET['author'])) { - // Clean up author string +// Clean up author string $author = $_GET['author']; if (substr($author, strlen($author) - 1, 1) == ",") { $author = substr($author, 0, strlen($author) - 1); } $author = explode(',', $author); $interface->assign('author', $author); + + $authornaf = $_GET['authornaf']; + //We'll now search to see if we can find + //a wikipedia article that seems associated with the + //author by using common title words + // Connect To Wikipedia if (!isset($_GET['page']) || ($_GET['page'] == 1)) { - $url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=php&titles=' . 
urlencode("$author[1] $author[0]"); - $client = new HTTP_Request(); - $client->setMethod(HTTP_REQUEST_METHOD_GET); - $client->setURL($url); - $result = $client->sendRequest(); - if (!PEAR::isError($result)) { - $body = unserialize($client->getResponseBody()); - - //Check if data exists or not - if(!$body['query']['pages']['-1']) { - $body = array_shift($body['query']['pages']); - $info['name'] = $body['title']; - $body = array_shift($body['revisions']); - $body = explode("\n", $body['*']); - - $done = 0; + // Get records by this author + $this->db = new SOLR($configArray['SOLR']['url']); + $result = $this->db->query('authornaf:"' . $_GET['authornaf'] . '"', null, 0, 20); + + /* The result will have some information about + the SOLR query and also information about + each record. Issue is this is an array of arrays, + unless there's only one result, then it's just + an array with values */ + + if (is_array($result['record'][0])) { + $records = $result['record']; + } + else if (is_array($result['record'])){ + $records = array($result['record']); + } + + $titles = array(); + $words = array(); + + for($i = 0;$i < count($records);$i++) { + $words = $this->countWords($records[$i]['title'],$words); + } + + + asort($words); + + /* now the words are sorted from least frequent to most frequent, so the array_pop() calls below take the most common words from the end */ + $words = array_keys($words); + + /* now we search for the author words (from + earlier processing) and the two most common + words. Why? Some rough testing seems to + indicate this was a good number. */ + $url = "http://en.wikipedia.org/w/index.php?title=Special:Search&search=" . urlencode("$author[1] $author[0] " .array_pop($words) . " ". array_pop($words) ); + + //Now we examine the results. 
+ + $client = new HTTP_Request(); + $client->setMethod(HTTP_REQUEST_METHOD_GET); + $client->setURL($url); + $result = $client->sendRequest(); + if (!PEAR::isError($result)) { + $xmlstring = $client->getResponseBody(); + } + else { + print("Errorerror"); + } + + //need to suppress warnings + //errors about id + $xmldoc = new DOMDocument(); + + //see http://www.mutinydesign.co.uk/scripts/problems-encountered-with-php-dom-functions---3/ on suppressing warnings -> bad html + @$xmldoc->loadHTML($xmlstring); + + $docXpath = new DOMXPath($xmldoc); + + //for some reason I haven't quite yet figured out, + //registering the namespace isn't working, + //the dom class seems to ignore it in the source + //document + $query = '/html/body/div[@id="globalWrapper"]/div[@id="column-content"]/div[@id="content"]/div[@id="bodyContent"]/ul[1]/li/a'; + + $links = $docXpath->query($query); + $goodlink = ''; + + //Now, I'll iterate through the results + //I'm looking for the first result that + //has all the parts of the author name in it + // + //This could definitely be improved + foreach($links as $link) { + + + $firstname = $author[1]; + $firstname = str_replace(array('.',','),'',$firstname); + $firstname = trim($firstname); + + + $lastname = $author[0]; + $lastname = str_replace(array('.',','),'',$lastname); + $lastname = trim($lastname); + + if (stripos($link->nodeValue,$firstname) > -1 && + stripos($link->nodeValue,$lastname) > -1) + { + + //print("good link
"); + $goodlink = $link->attributes->getNamedItem('href')->nodeValue; + break; + + } + } + + $title = substr($goodlink,6); + + $interface->assign('info', $info); + + $url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=php&titles=' . $title; + + + + //if we found something, display the wikipedia info + //(in final version we'd want to have something displayed + // if there wasn't a match or a more strict + if ($goodlink != '') { + $client = new HTTP_Request(); + $client->setMethod(HTTP_REQUEST_METHOD_GET); + $client->setURL($url); + $result = $client->sendRequest(); + if (!PEAR::isError($result)) { + $body = unserialize($client->getResponseBody()); + + + //Check if data exists or not + if(!$body['query']['pages']['-1']) { + $body = array_shift($body['query']['pages']); + $info['name'] = $body['title']; + + $body = array_shift($body['revisions']); + $body = explode("\n", $body['*']); + + $done = 0; while(!$done) { - if($body[0] == '') { - array_shift($body); - continue; - } - switch(substr($body[0], 0, 2)){ + if($body[0] == '') { + array_shift($body); + continue; + } + switch(substr($body[0], 0, 2)){ case "[[" : - case "{{" : - case "}}" : + case "{{" : + case "}}" : case "]]" : - case "| " : - //echo " sub : '" . substr($body[0], 0, 2) . "' "; - $stpos = stripos($body[0], "image:"); + case "| " : + //echo " sub : '" . substr($body[0], 0, 2) . "' "; + $stpos = stripos($body[0], "image:"); if(!$stpos) $stpos = stripos($body[0], "image"); if($stpos) { @@ -140,16 +315,17 @@ $info['image'] = $image; $info['description'] = $desc; - $interface->assign('info', $info); + $interface->assign('info', $info); } - } + } + } } } // Get records by this author $this->db = new SOLR($configArray['SOLR']['url']); - $result = $this->db->query('author:"' . $_GET['author'] . '"', null, 0, 20); + $result = $this->db->query('authornaf:"' . $_GET['authornaf'] . 
'"', null, 0, 20); if (isset($result['record']['id'])) { // Hack for issue with XML_Serializer $result['record'] = array($result['record']);