Subversion Repositories cheapmusic

Rev

Blame | Last modification | View Log | RSS feed

<?php namespace Fuse\Bitap;

function search($text, $pattern, $patternAlphabet, $options = [])
{
    $options = array_merge([
        'location' => 0,
        'distance' => 100,
        'threshold' => 0.6,
        'findAllMatches' => false,
        'minMatchCharLength' => 1
    ], $options);

    $expectedLocation  = $options['location'];
    // Set starting location at beginning text and initialize the alphabet.
    $textLen = mb_strlen($text);
    // Highest score beyond which we give up.
    $currentThreshold = $options['threshold'];
    // Is there a nearby exact match? (speedup)
    $bestLocation = mb_strpos($text, $pattern, $expectedLocation);

    $patternLen = mb_strlen($pattern);

    // a mask of the matches
    $matchMask = [];
    for ($i = 0; $i < $textLen; $i++) {
        $matchMask[$i] = 0;
    }

    if ($bestLocation !== false) {
        $score = score($pattern, [
            'errors' => 0,
            'currentLocation' => $bestLocation,
            'expectedLocation' => $expectedLocation,
            'distance' => $options['distance']
        ]);
        $currentThreshold = min($score, $currentThreshold);

        // What about in the other direction? (speed up)
        $bestLocation = mb_strrpos($text, $pattern, $expectedLocation + $patternLen);

        if ($bestLocation !== false) {
            $score = score($pattern, [
                'errors' => 0,
                'currentLocation' => $bestLocation,
                'expectedLocation' => $expectedLocation,
                'distance' => $options['distance']
            ]);
            $currentThreshold = min($score, $currentThreshold);
        }
    }

    // Reset the best location
    $bestLocation = -1;

    $lastBitArr = [];
    $finalScore = 1;
    $binMax = $patternLen + $textLen;

    $mask = 1 << ($patternLen - 1);

    for ($i = 0; $i < $patternLen; $i++) {
        // Scan for the best match; each iteration allows for one more error.
        // Run a binary search to determine how far from the match location we can stray
        // at this error level.
        $binMin = 0;
        $binMid = $binMax;

        while ($binMin < $binMid) {
            $score = score($pattern, [
                'errors' => $i,
                'currentLocation' => $expectedLocation + $binMid,
                'expectedLocation' => $expectedLocation,
                'distance' => $options['distance']
            ]);

            if ($score <= $currentThreshold) {
                $binMin = $binMid;
            } else {
                $binMax = $binMid;
            }

            $binMid = floor(($binMax - $binMin) / 2 + $binMin);
        }

        // Use the result from this iteration as the maximum for the next.
        $binMax = $binMid;

        $start = max(1, $expectedLocation - $binMid + 1);
        $finish = $options['findAllMatches']
            ? $textLen
            : min($expectedLocation + $binMid, $textLen) + $patternLen;

        // Initialize the bit array
        $bitArr = [];

        $bitArr[$finish + 1] = (1 << $i) - 1;

        for ($j = $finish; $j >= $start; $j -= 1) {
            $currentLocation = $j - 1;

            $offset = mb_substr($text, $currentLocation, 1);
            $charMatch = isset($patternAlphabet[$offset])
                ? $patternAlphabet[$offset]
                : null;

            if ($charMatch) {
                $matchMask[$currentLocation] = 1;
            }

            // First pass: exact match
            $bitArr[$j] = (($bitArr[$j + 1] << 1) | 1) & $charMatch;

            // Subsequent passes: fuzzy match
            if ($i !== 0) {
                $bitArr[$j] |= ((($lastBitArr[$j + 1] | $lastBitArr[$j]) << 1) | 1) | $lastBitArr[$j + 1];
            }

            if ($bitArr[$j] & $mask) {
                $finalScore = score($pattern, [
                    'errors' => $i,
                    'currentLocation' => $currentLocation,
                    'expectedLocation' => $expectedLocation,
                    'distance' => $options['distance']
                ]);

                // This match will almost certainly be better than any existing match.
                // But check anyway.
                if ($finalScore <= $currentThreshold) {
                    // Indeed it is
                    $currentThreshold = $finalScore;
                    $bestLocation = $currentLocation;

                    // Already passed `loc`, downhill from here on in.
                    if ($bestLocation <= $expectedLocation) {
                        break;
                    }

                    // When passing `bestLocation`, don't exceed our current distance from `expectedLocation`.
                    $start = max(1, 2 * $expectedLocation - $bestLocation);
                }
            }
        }

        // No hope for a (better) match at greater error levels.
        $score = score($pattern, [
            'errors' => $i + 1,
            'currentLocation' => $expectedLocation,
            'expectedLocation' => $expectedLocation,
            'distance' => $options['distance']
        ]);

        if ($score > $currentThreshold) {
            break;
        }

        $lastBitArr = $bitArr;
    }

    // Count exact matches (those with a score of 0) to be "almost" exact
    return [
        'isMatch' => $bestLocation >= 0,
        'score' => $finalScore == 0 ? 0.001 : $finalScore,
        'matchedIndices' => matched_indices($matchMask, $options['minMatchCharLength'])
    ];
}