Server : Apache/2.4.41 (Ubuntu) System : Linux journalup 5.4.0-198-generic #218-Ubuntu SMP Fri Sep 27 20:18:53 UTC 2024 x86_64 User : www-data ( 33) PHP Version : 7.4.33 Disable Function : pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare, Directory : /var/www/html/lib/pkp/classes/search/ |
<?php

/**
 * @file classes/search/SubmissionSearchIndex.inc.php
 *
 * Copyright (c) 2014-2020 Simon Fraser University
 * Copyright (c) 2003-2020 John Willinsky
 * Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
 *
 * @class SubmissionSearchIndex
 * @ingroup search
 *
 * @brief Class to maintain a submission search index.
 */

import('lib.pkp.classes.search.SearchFileParser');
import('lib.pkp.classes.search.SearchHTMLParser');
import('lib.pkp.classes.search.SearchHelperParser');

// Path of the stopword list; one word per line, lines starting with '#' are skipped
define('SEARCH_STOPWORDS_FILE', 'lib/pkp/registry/stopwords.txt');

// Words are truncated to at most this length
define('SEARCH_KEYWORD_MAX_LENGTH', 40);

abstract class SubmissionSearchIndex {
	/**
	 * Split a string into a clean array of keywords.
	 *
	 * Processing steps, in order:
	 *  - an array of lines is joined into a single newline-separated string;
	 *  - a first set of punctuation characters is deleted outright;
	 *  - a second set of separator characters is replaced by a space;
	 *  - '*' becomes '%' when wildcards are allowed, a space otherwise
	 *    (presumably '%' is interpreted as a wildcard by the search
	 *    back-end -- confirm against the caller);
	 *  - the text is lower-cased and split on whitespace;
	 *  - stopwords, words shorter than the configured
	 *    [search] min_word_length and numeric tokens are dropped;
	 *  - every remaining word is truncated to SEARCH_KEYWORD_MAX_LENGTH.
	 *
	 * @param $text string|array Text to tokenize, or an array of lines
	 * @param $allowWildcards boolean Whether '*' is kept as a wildcard marker
	 * @return array of keywords
	 */
	public function filterKeywords($text, $allowWildcards = false) {
		$minLength = Config::getVar('search', 'min_word_length');
		$stopwords = $this->_loadStopwords();

		// Join multiple lines into a single string
		if (is_array($text)) {
			$text = join("\n", $text);
		}

		// Remove punctuation
		$text = PKPString::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $text);
		$text = PKPString::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $text);
		$text = PKPString::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $text);
		$text = PKPString::strtolower($text);

		// Split into words
		$words = PKPString::regexp_split('/\s+/', $text);

		// FIXME Do not perform further filtering for some fields, e.g., author names?
		// Remove stopwords, too-short words and purely numeric tokens
		$keywords = array();
		foreach ($words as $word) {
			if (isset($stopwords[$word])) continue;
			if (PKPString::strlen($word) < $minLength) continue;
			if (is_numeric($word)) continue;
			$keywords[] = PKPString::substr($word, 0, SEARCH_KEYWORD_MAX_LENGTH);
		}
		return $keywords;
	}

	/**
	 * Return list of stopwords.
	 *
	 * The list is read from SEARCH_STOPWORDS_FILE on first use and kept
	 * in a static variable afterwards. Blank lines and lines starting
	 * with '#' are ignored. If the file cannot be read, the list falls
	 * back to being empty instead of passing a non-array to the array
	 * functions below.
	 *
	 * FIXME: Should this be locale-specific?
	 * @return array with stopwords as keys
	 */
	protected function _loadStopwords() {
		static $searchStopwords;

		if (!isset($searchStopwords)) {
			// Load stopwords only once per request
			$lines = file(SEARCH_STOPWORDS_FILE);
			if ($lines === false) {
				// file() has already raised its own warning; continue
				// without stopwords rather than failing further down.
				$lines = array();
			}

			$searchStopwords = array_count_values(
				array_filter(
					array_map('trim', $lines),
					function($line) {
						// Skip blank lines and '#' comment lines
						return $line !== '' && $line[0] !== '#';
					}
				)
			);

			// The empty string counts as a stopword so that empty tokens
			// produced by splitting are always discarded.
			$searchStopwords[''] = 1;
		}

		return $searchStopwords;
	}

	/**
	 * Let the indexing back-end know that the current transaction
	 * finished so that the index can be batch-updated.
	 */
	abstract public function submissionChangesFinished();

	/**
	 * Signal to the indexing back-end that the metadata of a submission
	 * changed.
	 *
	 * Push indexing implementations will try to immediately update
	 * the index to reflect the changes. Pull implementations will
	 * mark articles as "changed" and let the indexing back-end decide
	 * the best point in time to actually index the changed data.
	 *
	 * @param $submission Submission
	 */
	abstract public function submissionMetadataChanged($submission);

	/**
	 * Remove indexed file contents for a submission
	 * @param $submission Submission
	 */
	abstract public function clearSubmissionFiles($submission);
}