Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions src/Indexer/TNTIndexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,15 @@ public function createIndex($indexName)
term_id INTEGER,
doc_id INTEGER,
field_id INTEGER,
field_len INTEGER,
position INTEGER,
hit_count INTEGER)");

$this->index->exec("CREATE TABLE IF NOT EXISTS docinfo (
doc_id INTEGER,
field_id INTEGER,
num_terms INTEGER)");

$this->index->exec("CREATE TABLE IF NOT EXISTS info (
key TEXT,
value INTEGER)");
Expand Down Expand Up @@ -455,6 +461,7 @@ public function saveToIndex($stems, $docId)
$terms = $this->saveWordlist($stems);
$this->saveDoclist($terms, $docId);
$this->saveHitList($stems, $docId, $terms);
$this->saveDocInfo($stems, $docId);
}

/**
Expand Down Expand Up @@ -538,33 +545,46 @@ public function saveDoclist($terms, $docId)

public function saveHitList($stems, $docId, $termsList)
{
return;
$fieldCounter = 0;
$fields = [];

$insert = "INSERT INTO hitlist (term_id, doc_id, field_id, position, hit_count)
VALUES (:term_id, :doc_id, :field_id, :position, :hit_count)";
$insert = "INSERT INTO hitlist (term_id, doc_id, field_id, field_len, hit_count)
VALUES (:term_id, :doc_id, :field_id, :field_len, :hit_count)";
$stmt = $this->index->prepare($insert);

foreach ($stems as $field => $terms) {
$fields[$fieldCounter] = $field;
$positionCounter = 0;
$termCounts = array_count_values($terms);
foreach ($terms as $term) {
if (isset($termsList[$term])) {
$stmt->bindValue(':term_id', $termsList[$term]['id']);
$stmt->bindValue(':doc_id', $docId);
$stmt->bindValue(':field_id', $fieldCounter);
$stmt->bindValue(':position', $positionCounter);
$stmt->bindValue(':hit_count', $termCounts[$term]);
$stmt->execute();
}
$positionCounter++;
$field_len = count($terms);
foreach ($termCounts as $term => $hitCount) {
$stmt->bindValue(':term_id', $termsList[$term]['id']);
$stmt->bindValue(':doc_id', $docId);
$stmt->bindValue(':field_id', $fieldCounter);
$stmt->bindValue(':field_len', $field_len);
$stmt->bindValue(':hit_count', $termCounts[$term]);
$stmt->execute();
}
$fieldCounter++;
}
}

/**
 * Records, for each field of a document, how many terms that field contains.
 *
 * One row is inserted into the docinfo table per entry of $stems. Fields are
 * numbered by their iteration order in $stems, starting at 0; the keys of
 * $stems themselves are not stored.
 *
 * @param array|\Traversable $stems  per-field term lists; each value is passed to count()
 * @param mixed              $docId  value written to docinfo.doc_id
 */
public function saveDocInfo($stems, $docId)
{
    $insert = "INSERT INTO docinfo (doc_id, field_id, num_terms) VALUES (:doc, :field_id, :num_terms)";

    // The statement text never changes, so it is prepared once (on first use,
    // so nothing is prepared when $stems is empty) and re-executed per field.
    $stmt = null;

    $fieldCounter = 0;
    foreach ($stems as $terms) {
        if ($stmt === null) {
            $stmt = $this->index->prepare($insert);
        }

        $stmt->bindValue(':doc', $docId);
        $stmt->bindValue(':field_id', $fieldCounter);
        $stmt->bindValue(':num_terms', count($terms));
        $stmt->execute();

        $fieldCounter++;
    }
}

public function getWordFromWordList($word)
{
$selectStmt = $this->index->prepare("SELECT * FROM wordlist WHERE term like :keyword LIMIT 1");
Expand Down
86 changes: 77 additions & 9 deletions src/TNTSearch.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,25 +101,35 @@ public function search($phrase, $numOfResults = 100)
return $this->stemmer->stem($keyword);
});

$tfWeight = 1;
$dlWeight = 0.5;
$tfWeight = 1.2;
$dlWeight = 0.75;
$docScores = [];
$count = $this->totalDocumentsInCollection();
$avgFlen = $this->getAverageFieldLength();
$docTerms = array();

foreach ($keywords as $index => $term) {
$isLastKeyword = ($keywords->count() - 1) == $index;
$df = $this->totalMatchingDocuments($term, $isLastKeyword);
$idf = log($count / max(1, $df));
foreach ($this->getAllDocumentsForKeyword($term, false, $isLastKeyword) as $document) {
$docID = $document['doc_id'];
$tf = $document['hit_count'];
$idf = log(1 + ($count - $df + 0.5) / ($df + 0.5));
foreach ($this->getAllHitsForKeyword($term, true, $isLastKeyword) as $hit) {
$docID = $hit['doc_id'];
$tf = $hit['hit_count'];
$dlen = $hit['field_len'];
$fnorm = 1/sqrt($hit['field_len']);
$num = ($tfWeight + 1) * $tf;
$avgDlen = $avgFlen[$hit['field_id']];
$denom = $tfWeight
* ((1 - $dlWeight) + $dlWeight)
* ((1 - $dlWeight) + $dlWeight * $dlen / $avgDlen)
+ $tf;
$score = $idf * ($num / $denom);
$score = $fnorm * $idf * ($num / $denom);
$docScores[$docID] = isset($docScores[$docID]) ?
$docScores[$docID] + $score : $score;

if (!isset($docTerms[$docID])) {
$docTerms[$docID] = array();
}
$docTerms[$docID][$term] = 1;
}
}

Expand All @@ -128,7 +138,9 @@ public function search($phrase, $numOfResults = 100)
$docs = new Collection($docScores);

$totalHits = $docs->count();
$docs = $docs->map(function ($doc, $key) {
$docs = $docs->filter(function ($score, $docID) use ($docTerms, $keywords) {
return (count($docTerms[$docID]) == $keywords->count());
})->map(function ($doc, $key) {
return $key;
})->take($numOfResults);
$stopTimer = microtime(true);
Expand Down Expand Up @@ -254,6 +266,24 @@ public function getAllDocumentsForKeyword($keyword, $noLimit = false, $isLastKey
return $this->getAllDocumentsForStrictKeyword($word, $noLimit);
}

/**
 * Fetches the hitlist rows for a search keyword.
 *
 * The keyword is first resolved through getWordlistByKeyword(). When that
 * lookup has no first entry, an empty Collection is returned; otherwise the
 * lookup result is handed to getAllHitsForStrictKeyword().
 *
 * @param      $keyword        keyword to look up
 * @param bool $noLimit        forwarded unchanged to getAllHitsForStrictKeyword()
 * @param bool $isLastKeyword  forwarded unchanged to getWordlistByKeyword()
 *
 * @return Collection
 */
public function getAllHitsForKeyword($keyword, $noLimit = false, $isLastKeyword = false)
{
    $matches = $this->getWordlistByKeyword($keyword, $isLastKeyword);

    // TODO: Fuzzy
    if (isset($matches[0])) {
        return $this->getAllHitsForStrictKeyword($matches, $noLimit);
    }

    return new Collection([]);
}

/**
* @param $keyword
* @param bool $noLimit
Expand Down Expand Up @@ -506,4 +536,42 @@ private function getAllDocumentsForStrictKeyword($word, $noLimit)
$stmtDoc->execute();
return new Collection($stmtDoc->fetchAll(PDO::FETCH_ASSOC));
}

/**
 * Loads every hitlist row for the first word of a wordlist lookup result,
 * ordered by hit_count from highest to lowest.
 *
 * @param $word     wordlist lookup result; only $word[0]['id'] is read
 * @param $noLimit  currently unused: no LIMIT is applied to the query
 *
 * @return Collection rows fetched as associative arrays
 */
private function getAllHitsForStrictKeyword($word, $noLimit)
{
    // TODO: limit?
    $statement = $this->index->prepare(
        "SELECT * FROM hitlist WHERE term_id = :id ORDER BY hit_count DESC"
    );
    $statement->bindValue(':id', $word[0]['id']);
    $statement->execute();

    $rows = $statement->fetchAll(PDO::FETCH_ASSOC);

    return new Collection($rows);
}

/**
 * Computes the average num_terms per field over all rows of the docinfo table.
 *
 * All averages are read with a single grouped query instead of one query per
 * field. The result is indexed by field id and covers every id from 0 up to
 * the largest field id present; ids in that range with no rows map to null.
 * An empty docinfo table yields [0 => null].
 *
 * @return array field_id => average number of terms (float), or null when the
 *               field has no rows
 */
private function getAverageFieldLength()
{
    $query = "SELECT field_id, AVG(num_terms) FROM docinfo GROUP BY field_id";
    $stmtDoc = $this->index->prepare($query);
    $stmtDoc->execute();

    $averages = [];
    foreach ($stmtDoc->fetchAll(PDO::FETCH_NUM) as $row) {
        // Rows with a NULL field_id belong to no field index and are skipped.
        if ($row[0] === null) {
            continue;
        }
        // Some PDO drivers return numeric columns as strings; normalise to float.
        $averages[(int) $row[0]] = $row[1] === null ? null : (float) $row[1];
    }

    $maxFieldId = empty($averages) ? 0 : max(array_keys($averages));

    // Build a dense 0..max array so callers can index by any field id in range.
    $avgFlen = [];
    for ($field = 0; $field <= $maxFieldId; $field++) {
        $avgFlen[$field] = isset($averages[$field]) ? $averages[$field] : null;
    }

    return $avgFlen;
}
}