number of times words appears
     Input:
       $string - the string to analyze
       $words  - an array consisting of word => number of times words appear;
                 if there are existing values, they will be added to.
                 This means you can pass in a series of strings and get
                 the overall totals
     Notes:
       - the string is split on single spaces; every token is lower-cased
         and stripped of punctuation (hyphens are deliberately kept because
         they might be important)
       - tokens that are empty after stripping, or that appear in a small
         stop list of very common words, are not counted
    */
    function countWords($string,$words)
    {
        // Characters removed from every token before it is counted.
        // NOTE(review): "\'" in double quotes is the two-character sequence
        // backslash + apostrophe, so a bare apostrophe is NOT stripped.
        // Confirm whether that is intended before changing it.
        $punctuation = array(':', ';', ',', "\'", '"', '(', ')', '|', '/', '?',
                             '!', '@', '#', '$', '%', '^', '&', '*', "\\", '.',
                             '+', '=', '_', '~', '`');

        // Simple stop list.
        $stopWords = array('the', 'a', 's', 'of', 'on', 'in', 'an', 'if', 'to', 'and');

        foreach (explode(' ', (string) $string) as $word) {
            $word = str_replace($punctuation, '', strtolower($word));
            if ($word === '' || in_array($word, $stopWords, true)) {
                continue;
            }
            // Initialise the counter explicitly so that the first occurrence
            // of a word does not increment an undefined array key.
            if (!isset($words[$word])) {
                $words[$word] = 0;
            }
            $words[$word]++;
        }

        return($words);
    }

    /**
     * Build and display the author page.
     *
     * Reads $_GET['author'] ("Last, First" form), $_GET['authornaf'] and
     * $_GET['page'].  Steps:
     *   1. assign the last entry of the "search" cookie as 'lastsearch';
     *   2. query SOLR for records whose authornaf matches;
     *   3. on the first page only, search Wikipedia for the author name plus
     *      the two most frequent title words, pick the first result link that
     *      contains the author's name parts, fetch that article and assign
     *      'info' (name, image, description) to the template;
     *   4. assign 'recordSet' and 'pager', then display layout.tpl with
     *      home.tpl as the page template.
     *
     * NOTE(review): the SOLR query is always issued with the arguments
     * (0, 20) regardless of $_GET['page'].  If those are start/limit, every
     * page shows the same records - confirm against the SOLR class.
     */
    function launch()
    {
        global $configArray;
        global $interface;

        $interface->caching = false;

        if (!isset($_GET['author'])) {
            PEAR::raiseError(new PEAR_Error('Unknown Author'));
        } else {
            $interface->assign('author', $_GET['author']);
        }

        // Read the request parameters once, with safe defaults, so that a
        // missing parameter cannot trigger undefined-index errors below.
        $authorParam = isset($_GET['author']) ? $_GET['author'] : '';
        $authorNaf   = isset($_GET['authornaf']) ? $_GET['authornaf'] : '';
        $pageParam   = isset($_GET['page']) ? $_GET['page'] : '';
        $onFirstPage = !isset($_GET['page']) || ($_GET['page'] == 1);

        // Retrieve User Search History
        if (isset($_COOKIE['search'])) {
            // SECURITY NOTE(review): unserialize() on a client-supplied cookie
            // allows object injection.  The cookie should be stored in a safe
            // format (e.g. JSON); that needs a matching change where the
            // cookie is written, so it is only flagged here.
            $sHistory = unserialize($_COOKIE['search']);
            if (is_array($sHistory) && count($sHistory) > 0) {
                $lastSearch = $sHistory[count($sHistory) - 1];
                $interface->assign('lastsearch', $lastSearch);
            }
        }

        // Get records by this author.  The query is issued once and reused
        // for both the Wikipedia lookup and the record list.  Quotes and
        // backslashes are escaped because the value comes from the request
        // and is placed inside a quoted phrase.
        $this->db = new SOLR($configArray['SOLR']['url']);
        $solrResult = $this->db->query('authornaf:"' . addcslashes($authorNaf, '"\\') . '"', null, 0, 20);
        if (PEAR::isError($solrResult)) {
            PEAR::raiseError($solrResult);
            $solrResult = array();
        }

        if (!$interface->is_cached('layout.tpl|Author' . $authorParam)) {
            // Clean up author string: drop one trailing comma, then split
            // "Last, First" into its parts.
            $author = $authorParam;
            if (substr($author, -1) == ',') {
                $author = substr($author, 0, -1);
            }
            $author = explode(',', $author);
            $interface->assign('author', $author);

            // We'll now search to see if we can find a wikipedia article that
            // seems associated with the author by using common title words.
            if ($onFirstPage) {
                /* The result will have some information about the SOLR query
                   and also information about each record.  Issue is this is
                   an array of arrays, unless there's only one result, then
                   it's just an array with values */
                $records = array();
                if (isset($solrResult['record'])) {
                    $found = $solrResult['record'];
                    if (isset($found[0]) && is_array($found[0])) {
                        $records = $found;
                    } else if (is_array($found)) {
                        $records = array($found);
                    }
                }

                $words = array();
                foreach ($records as $record) {
                    if (isset($record['title'])) {
                        $words = $this->countWords($record['title'], $words);
                    }
                }

                // asort() orders from least to most frequent, so array_pop()
                // below yields the most frequent word first.
                asort($words);
                $words = array_keys($words);

                /* now we search for the author words (from earlier
                   processing) and the two most common words.  Why?  Some
                   rough testing seemed to indicate this was a good number. */
                $lastPart   = $author[0];
                $firstPart  = isset($author[1]) ? $author[1] : '';
                $topWord    = array_pop($words);
                $secondWord = array_pop($words);
                $url = "http://en.wikipedia.org/w/index.php?title=Special:Search&search="
                     . urlencode("$firstPart $lastPart " . $topWord . " " . $secondWord);

                // Connect To Wikipedia
                $client = new HTTP_Request();
                $client->setMethod(HTTP_REQUEST_METHOD_GET);
                $client->setURL($url);
                $result = $client->sendRequest();
                $xmlstring = '';
                if (!PEAR::isError($result)) {
                    $xmlstring = $client->getResponseBody();
                } else {
                    print("Errorerror");
                }

                // Now we examine the results.
                $goodlink = '';
                if ($xmlstring != '') {
                    // Warnings are suppressed because the page is real-world
                    // HTML that the DOM parser complains about (duplicate
                    // ids and similar).
                    $xmldoc = new DOMDocument();
                    @$xmldoc->loadHTML($xmlstring);
                    $docXpath = new DOMXPath($xmldoc);

                    // NOTE(review): this path is tied to the markup of the
                    // Wikipedia search result page and silently matches
                    // nothing if that markup differs.
                    $query = '/html/body/div[@id="globalWrapper"]/div[@id="column-content"]/div[@id="content"]/div[@id="bodyContent"]/ul[1]/li/a';
                    $links = $docXpath->query($query);

                    // The name parts do not change per link, so they are
                    // normalised once, outside the loop.
                    $firstname = trim(str_replace(array('.', ','), '', $firstPart));
                    $lastname  = trim(str_replace(array('.', ','), '', $lastPart));

                    // Look for the first result that has all the (non-empty)
                    // parts of the author name in it.
                    // This could definitely be improved.
                    if ($links && ($firstname !== '' || $lastname !== '')) {
                        foreach ($links as $anchor) {
                            $text = $anchor->nodeValue;
                            // stripos() returns false when there is no match
                            // and 0 for a match at the very start, so the
                            // result must be compared with false explicitly.
                            $hasFirst = ($firstname === '') || (stripos($text, $firstname) !== false);
                            $hasLast  = ($lastname === '') || (stripos($text, $lastname) !== false);
                            if ($hasFirst && $hasLast) {
                                $href = $anchor->getAttribute('href');
                                if ($href != '') {
                                    $goodlink = $href;
                                    break;
                                }
                            }
                        }
                    }
                }

                // if we found something, display the wikipedia info
                // (in final version we'd want to have something displayed
                // if there wasn't a match or a more strict
                if ($goodlink != '') {
                    // Drop the first six characters of the href (the
                    // "/wiki/" prefix of an article link) to get the title.
                    $title = substr($goodlink, 6);
                    $url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=php&titles=' . $title;

                    $client = new HTTP_Request();
                    $client->setMethod(HTTP_REQUEST_METHOD_GET);
                    $client->setURL($url);
                    $result = $client->sendRequest();
                    if (!PEAR::isError($result)) {
                        // SECURITY NOTE(review): unserialize() on a response
                        // fetched over plain HTTP trusts the network path.
                        // Switching the request to format=json together with
                        // json_decode() would be safer.
                        $response = unserialize($client->getResponseBody());

                        // A page keyed -1 means the article does not exist.
                        if (is_array($response)
                            && isset($response['query']['pages'])
                            && is_array($response['query']['pages'])
                            && empty($response['query']['pages']['-1'])
                        ) {
                            $page = array_shift($response['query']['pages']);
                            $revision = null;
                            if (isset($page['revisions']) && is_array($page['revisions'])) {
                                $revision = array_shift($page['revisions']);
                            }

                            if (isset($page['title']) && isset($revision['*'])) {
                                $info = array();
                                $info['name'] = $page['title'];
                                $lines = explode("\n", $revision['*']);

                                // Skip the leading blank and markup lines
                                // (templates, infobox rows, links) while
                                // remembering the last image file name seen.
                                // The loop is bounded by the number of lines,
                                // so it ends even when the article consists
                                // of nothing but such lines.
                                $image = null;
                                $markers = array('[[', '{{', '}}', ']]', '| ');
                                while (count($lines) > 0) {
                                    $line = $lines[0];
                                    if ($line === '') {
                                        array_shift($lines);
                                        continue;
                                    }
                                    if (!in_array(substr($line, 0, 2), $markers, true)) {
                                        break;
                                    }

                                    $stpos = stripos($line, "image:");
                                    if ($stpos === false) {
                                        $stpos = stripos($line, "image");
                                    }
                                    if ($stpos !== false) {
                                        // First matching extension wins.
                                        foreach (array(".jpg", ".gif") as $extension) {
                                            $endpos = stripos($line, $extension);
                                            if ($endpos !== false) {
                                                if ($endpos > $stpos) {
                                                    $image = substr($line, $stpos, $endpos + strlen($extension) - $stpos);
                                                }
                                                break;
                                            }
                                        }
                                    }
                                    array_shift($lines);
                                }

                                // The description is everything up to the
                                // first section heading ("=="), or up to the
                                // end of the text when there is no heading.
                                $desc = "";
                                while (count($lines) > 0 && substr($lines[0], 0, 2) != "==") {
                                    $desc .= array_shift($lines);
                                }

                                $pattern = array();
                                $replacement = array();
                                // [[target]] becomes target
                                $pattern[] = '/(\x5b\x5b)([^\x5d|]*)(\x5d\x5d)/';
                                $replacement[] = '$2';
                                // [[target|label]] becomes label
                                $pattern[] = '/(\x5b\x5b)([^\x5d]*)\x7c([^\x5d]*)(\x5d\x5d)/';
                                $replacement[] = '$3';
                                // Removes citation
                                $pattern[] = '/({{)[^}]*(}})/';
                                $replacement[] = "";
                                $desc = preg_replace($pattern, $replacement, $desc);

                                $info['image'] = $image;
                                $info['description'] = $desc;
                                $interface->assign('info', $info);
                            }
                        }
                    }
                }
            }
        }

        // Hack for issue with XML_Serializer: a single record arrives as a
        // flat array rather than as a list containing one record.
        if (isset($solrResult['record']['id'])) {
            $solrResult['record'] = array($solrResult['record']);
        }
        $interface->assign('recordSet', isset($solrResult['record']) ? $solrResult['record'] : null);

        // Build the pager link template: replace the current page number
        // with a %d placeholder, or append one when no page is in the URI.
        $pagerLink = (strstr($_SERVER['REQUEST_URI'], 'page='))
            ? str_replace('page=' . $pageParam, '', $_SERVER['REQUEST_URI']) . 'page=%d'
            : $_SERVER['REQUEST_URI'] . '&page=%d';

        $options = array('totalItems' => isset($solrResult['RecordCount']) ? $solrResult['RecordCount'] : 0,
                         'mode' => 'sliding',
                         'path' => '',
                         'fileName' => $pagerLink,
                         'delta' => 5,
                         'perPage' => 20,
                         'nextImg' => 'Next »',
                         'prevImg' => '« Prev',
                         'separator' => '',
                         'spacesBeforeSeparator' => 0,
                         'spacesAfterSeparator' => 0,
                         'append' => false,
                         'clearIfVoid' => true,
                         'urlVar' => 'page',
                         'curPageSpanPre' => '',
                         'curPageSpanPost' => '');
        $pager =& Pager::factory($options);
        $interface->assign('pager', $pager);

        $interface->setTemplate('home.tpl');
        $interface->display('layout.tpl', 'Author' . $authorParam);
    }
}
?>