93 |
- |
1 |
<?php namespace Fuse\Bitap;
|
|
|
2 |
|
|
|
3 |
function search($text, $pattern, $patternAlphabet, $options = [])
|
|
|
4 |
{
|
|
|
5 |
$options = array_merge([
|
|
|
6 |
'location' => 0,
|
|
|
7 |
'distance' => 100,
|
|
|
8 |
'threshold' => 0.6,
|
|
|
9 |
'findAllMatches' => false,
|
|
|
10 |
'minMatchCharLength' => 1
|
|
|
11 |
], $options);
|
|
|
12 |
|
|
|
13 |
$expectedLocation = $options['location'];
|
|
|
14 |
// Set starting location at beginning text and initialize the alphabet.
|
|
|
15 |
$textLen = mb_strlen($text);
|
|
|
16 |
// Highest score beyond which we give up.
|
|
|
17 |
$currentThreshold = $options['threshold'];
|
|
|
18 |
// Is there a nearby exact match? (speedup)
|
|
|
19 |
$bestLocation = mb_strpos($text, $pattern, $expectedLocation);
|
|
|
20 |
|
|
|
21 |
$patternLen = mb_strlen($pattern);
|
|
|
22 |
|
|
|
23 |
// a mask of the matches
|
|
|
24 |
$matchMask = [];
|
|
|
25 |
for ($i = 0; $i < $textLen; $i++) {
|
|
|
26 |
$matchMask[$i] = 0;
|
|
|
27 |
}
|
|
|
28 |
|
|
|
29 |
if ($bestLocation !== false) {
|
|
|
30 |
$score = score($pattern, [
|
|
|
31 |
'errors' => 0,
|
|
|
32 |
'currentLocation' => $bestLocation,
|
|
|
33 |
'expectedLocation' => $expectedLocation,
|
|
|
34 |
'distance' => $options['distance']
|
|
|
35 |
]);
|
|
|
36 |
$currentThreshold = min($score, $currentThreshold);
|
|
|
37 |
|
|
|
38 |
// What about in the other direction? (speed up)
|
|
|
39 |
$bestLocation = mb_strrpos($text, $pattern, $expectedLocation + $patternLen);
|
|
|
40 |
|
|
|
41 |
if ($bestLocation !== false) {
|
|
|
42 |
$score = score($pattern, [
|
|
|
43 |
'errors' => 0,
|
|
|
44 |
'currentLocation' => $bestLocation,
|
|
|
45 |
'expectedLocation' => $expectedLocation,
|
|
|
46 |
'distance' => $options['distance']
|
|
|
47 |
]);
|
|
|
48 |
$currentThreshold = min($score, $currentThreshold);
|
|
|
49 |
}
|
|
|
50 |
}
|
|
|
51 |
|
|
|
52 |
// Reset the best location
|
|
|
53 |
$bestLocation = -1;
|
|
|
54 |
|
|
|
55 |
$lastBitArr = [];
|
|
|
56 |
$finalScore = 1;
|
|
|
57 |
$binMax = $patternLen + $textLen;
|
|
|
58 |
|
|
|
59 |
$mask = 1 << ($patternLen - 1);
|
|
|
60 |
|
|
|
61 |
for ($i = 0; $i < $patternLen; $i++) {
|
|
|
62 |
// Scan for the best match; each iteration allows for one more error.
|
|
|
63 |
// Run a binary search to determine how far from the match location we can stray
|
|
|
64 |
// at this error level.
|
|
|
65 |
$binMin = 0;
|
|
|
66 |
$binMid = $binMax;
|
|
|
67 |
|
|
|
68 |
while ($binMin < $binMid) {
|
|
|
69 |
$score = score($pattern, [
|
|
|
70 |
'errors' => $i,
|
|
|
71 |
'currentLocation' => $expectedLocation + $binMid,
|
|
|
72 |
'expectedLocation' => $expectedLocation,
|
|
|
73 |
'distance' => $options['distance']
|
|
|
74 |
]);
|
|
|
75 |
|
|
|
76 |
if ($score <= $currentThreshold) {
|
|
|
77 |
$binMin = $binMid;
|
|
|
78 |
} else {
|
|
|
79 |
$binMax = $binMid;
|
|
|
80 |
}
|
|
|
81 |
|
|
|
82 |
$binMid = floor(($binMax - $binMin) / 2 + $binMin);
|
|
|
83 |
}
|
|
|
84 |
|
|
|
85 |
// Use the result from this iteration as the maximum for the next.
|
|
|
86 |
$binMax = $binMid;
|
|
|
87 |
|
|
|
88 |
$start = max(1, $expectedLocation - $binMid + 1);
|
|
|
89 |
$finish = $options['findAllMatches']
|
|
|
90 |
? $textLen
|
|
|
91 |
: min($expectedLocation + $binMid, $textLen) + $patternLen;
|
|
|
92 |
|
|
|
93 |
// Initialize the bit array
|
|
|
94 |
$bitArr = [];
|
|
|
95 |
|
|
|
96 |
$bitArr[$finish + 1] = (1 << $i) - 1;
|
|
|
97 |
|
|
|
98 |
for ($j = $finish; $j >= $start; $j -= 1) {
|
|
|
99 |
$currentLocation = $j - 1;
|
|
|
100 |
|
|
|
101 |
$offset = mb_substr($text, $currentLocation, 1);
|
|
|
102 |
$charMatch = isset($patternAlphabet[$offset])
|
|
|
103 |
? $patternAlphabet[$offset]
|
|
|
104 |
: null;
|
|
|
105 |
|
|
|
106 |
if ($charMatch) {
|
|
|
107 |
$matchMask[$currentLocation] = 1;
|
|
|
108 |
}
|
|
|
109 |
|
|
|
110 |
// First pass: exact match
|
|
|
111 |
$bitArr[$j] = (($bitArr[$j + 1] << 1) | 1) & $charMatch;
|
|
|
112 |
|
|
|
113 |
// Subsequent passes: fuzzy match
|
|
|
114 |
if ($i !== 0) {
|
|
|
115 |
$bitArr[$j] |= ((($lastBitArr[$j + 1] | $lastBitArr[$j]) << 1) | 1) | $lastBitArr[$j + 1];
|
|
|
116 |
}
|
|
|
117 |
|
|
|
118 |
if ($bitArr[$j] & $mask) {
|
|
|
119 |
$finalScore = score($pattern, [
|
|
|
120 |
'errors' => $i,
|
|
|
121 |
'currentLocation' => $currentLocation,
|
|
|
122 |
'expectedLocation' => $expectedLocation,
|
|
|
123 |
'distance' => $options['distance']
|
|
|
124 |
]);
|
|
|
125 |
|
|
|
126 |
// This match will almost certainly be better than any existing match.
|
|
|
127 |
// But check anyway.
|
|
|
128 |
if ($finalScore <= $currentThreshold) {
|
|
|
129 |
// Indeed it is
|
|
|
130 |
$currentThreshold = $finalScore;
|
|
|
131 |
$bestLocation = $currentLocation;
|
|
|
132 |
|
|
|
133 |
// Already passed `loc`, downhill from here on in.
|
|
|
134 |
if ($bestLocation <= $expectedLocation) {
|
|
|
135 |
break;
|
|
|
136 |
}
|
|
|
137 |
|
|
|
138 |
// When passing `bestLocation`, don't exceed our current distance from `expectedLocation`.
|
|
|
139 |
$start = max(1, 2 * $expectedLocation - $bestLocation);
|
|
|
140 |
}
|
|
|
141 |
}
|
|
|
142 |
}
|
|
|
143 |
|
|
|
144 |
// No hope for a (better) match at greater error levels.
|
|
|
145 |
$score = score($pattern, [
|
|
|
146 |
'errors' => $i + 1,
|
|
|
147 |
'currentLocation' => $expectedLocation,
|
|
|
148 |
'expectedLocation' => $expectedLocation,
|
|
|
149 |
'distance' => $options['distance']
|
|
|
150 |
]);
|
|
|
151 |
|
|
|
152 |
if ($score > $currentThreshold) {
|
|
|
153 |
break;
|
|
|
154 |
}
|
|
|
155 |
|
|
|
156 |
$lastBitArr = $bitArr;
|
|
|
157 |
}
|
|
|
158 |
|
|
|
159 |
// Count exact matches (those with a score of 0) to be "almost" exact
|
|
|
160 |
return [
|
|
|
161 |
'isMatch' => $bestLocation >= 0,
|
|
|
162 |
'score' => $finalScore == 0 ? 0.001 : $finalScore,
|
|
|
163 |
'matchedIndices' => matched_indices($matchMask, $options['minMatchCharLength'])
|
|
|
164 |
];
|
|
|
165 |
}
|