KUNTUL | JINGKONTOT
JINGKONTOT


Server : Apache/2.4.41 (Ubuntu)
System : Linux journalup 5.4.0-198-generic #218-Ubuntu SMP Fri Sep 27 20:18:53 UTC 2024 x86_64
User : www-data ( 33)
PHP Version : 7.4.33
Disable Function : pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare,
Directory :  /var/www/html/lib/pkp/classes/search/

Upload File :
current_dir [ Writeable ] document_root [ Writeable ]

 

Current File : /var/www/html/lib/pkp/classes/search/SubmissionSearchIndex.inc.php
<?php

/**
 * @file classes/search/SubmissionSearchIndex.inc.php
 *
 * Copyright (c) 2014-2020 Simon Fraser University
 * Copyright (c) 2003-2020 John Willinsky
 * Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
 *
 * @class SubmissionSearchIndex
 * @ingroup search
 *
 * @brief Class to maintain a submission search index.
 */

import('lib.pkp.classes.search.SearchFileParser');
import('lib.pkp.classes.search.SearchHTMLParser');
import('lib.pkp.classes.search.SearchHelperParser');

define('SEARCH_STOPWORDS_FILE', 'lib/pkp/registry/stopwords.txt');

// Words are truncated to at most this length
define('SEARCH_KEYWORD_MAX_LENGTH', 40);

abstract class SubmissionSearchIndex {
	/**
	 * Split a string into a clean array of keywords
	 * @param $text string
	 * @param $allowWildcards boolean
	 * @return array of keywords
	 */
	public function filterKeywords($text, $allowWildcards = false) {
		$minLength = Config::getVar('search', 'min_word_length');
		$stopwords = $this->_loadStopwords();

		// Join multiple lines into a single string
		if (is_array($text)) $text = join("\n", $text);

		// Remove punctuation
		$text = PKPString::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $text);
		$text = PKPString::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $text);
		$text = PKPString::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $text);
		$text = PKPString::strtolower($text);

		// Split into words
		$words = PKPString::regexp_split('/\s+/', $text);

		// FIXME Do not perform further filtering for some fields, e.g., author names?

		// Remove stopwords
		$keywords = array();
		foreach ($words as $k) {
			if (!isset($stopwords[$k]) && PKPString::strlen($k) >= $minLength && !is_numeric($k)) {
				$keywords[] = PKPString::substr($k, 0, SEARCH_KEYWORD_MAX_LENGTH);
			}
		}
		return $keywords;
	}

	/**
	 * Return list of stopwords.
	 * FIXME: Should this be locale-specific?
	 * @return array with stopwords as keys
	 */
	protected function _loadStopwords() {
		static $searchStopwords;

		if (!isset($searchStopwords)) {
			// Load stopwords only once per request
			$searchStopwords = array_count_values(
				array_filter(
					array_map('trim', file(SEARCH_STOPWORDS_FILE)),
					function($a) {
						return !empty($a) && $a[0] != '#';
					}
				)
			);
			$searchStopwords[''] = 1;
		}

		return $searchStopwords;
	}

	/**
	 * Let the indexing back-end know that the current transaction
	 * finished so that the index can be batch-updated.
	 */
	abstract function submissionChangesFinished();

	/**
	 * Signal to the indexing back-end that the metadata of a submission
	 * changed.
	 *
	 * Push indexing implementations will try to immediately update
	 * the index to reflect the changes. Pull implementations will
	 * mark articles as "changed" and let the indexing back-end decide
	 * the best point in time to actually index the changed data.
	 *
	 * @param $submission Submission
	 */
	abstract public function submissionMetadataChanged($submission);

	/**
	 * Remove indexed file contents for a submission
	 * @param $submission Submission
	 */
	abstract function clearSubmissionFiles($submission);
}


KUNTUL | JINGKONTOT |