* @license https://opensource.org/licenses/MIT MIT License
 * @link    http://github.com/kevinfiol
 */
class Fuzz
{
    /** Search mode: rank records by Levenshtein edit distance (lower is better). */
    const MODE_LEVENSHTEIN = 0;

    /** Search mode: rank records by Jaro-Winkler similarity (higher is better). */
    const MODE_JARO_WINKLER = 1;

    /** @var array Records to search, keyed by an arbitrary record key. */
    private $_source;

    /** @var int Number of records in the source. */
    private $_sourceLen;

    /**
     * @var int Requested result cap (at least 1).
     *
     * NOTE(review): this value is stored but search() does not apply it;
     * search() returns every record that passes the filters.
     */
    private $_maxResults;

    /** @var int One of the MODE_* constants. */
    private $_searchMode;

    /** @var bool Whether the longest common substring is factored into scores. */
    private $_useLCS;

    /**
     * Fuzz Object Constructor
     *
     * @param array   $source     An array of associative arrays (the records to search)
     * @param int     $maxResults The maximum number of results to retrieve upon a search
     * @param int     $searchMode 0 = Levenshtein, 1 = Jaro-Winkler
     * @param boolean $useLCS     Factor in Longest Common Substring in search results
     *
     * @throws \InvalidArgumentException When $searchMode is outside the range 0..1
     */
    public function __construct($source, $maxResults, $searchMode, $useLCS)
    {
        if ($searchMode < 0 || $searchMode > 1) {
            throw new \InvalidArgumentException('Invalid search mode');
        }

        $this->_source = $source;
        $this->_sourceLen = count($source);
        $this->_maxResults = max($maxResults, 1);
        // Normalise to an integer so later mode checks can be strict.
        $this->_searchMode = (int) $searchMode;
        $this->_useLCS = $useLCS;
    }

    /**
     * Search Method
     *
     * Scores every record against the search term. For each record the best
     * value over all of its fields is used: the smallest Levenshtein distance
     * or the largest Jaro-Winkler similarity (negated, so that a lower score
     * always means a better match). When LCS is enabled, the longest common
     * substring length is subtracted from the score.
     *
     * Records without any fields cannot be measured and are skipped.
     *
     * @param string   $search      Term to search for
     * @param int|null $minLCS      (if using LCS) Minimum longest common substring required;
     *                              non-numeric values (null, '') disable the filter
     * @param int|null $maxDistance (if using Levenshtein) Maximum distance allowed;
     *                              0 means exact matches only, non-numeric values
     *                              (null, '') disable the filter
     *
     * @return array Map of record key => score, sorted ascending (best match first)
     */
    public function search($search, $minLCS = null, $maxDistance = null)
    {
        $search = strval($search);
        $useLevenshtein = $this->_searchMode === self::MODE_LEVENSHTEIN;

        // A filter only applies when it is relevant to the configured mode and
        // a real number was supplied. is_numeric() keeps 0 as a valid limit.
        $filterByLCS = $this->_useLCS && is_numeric($minLCS);
        $filterByDistance = $useLevenshtein && is_numeric($maxDistance);

        $scores = [];

        foreach ($this->_source as $pageKey => $data) {
            $measures = [];
            $substrings = [];

            foreach ($data as $val) {
                $field = strval($val);

                $measures[] = $useLevenshtein
                    ? $this->getLevenshtein($field, $search)
                    : $this->getJaroWinkler($field, $search);

                if ($this->_useLCS) {
                    $substrings[] = $this->getLCS($field, $search);
                }
            }

            // Nothing to compare against: an empty record must not outrank real matches.
            if (!$measures) {
                continue;
            }

            if ($useLevenshtein) {
                $lowestDistance = min($measures);

                if ($filterByDistance && $lowestDistance > $maxDistance) {
                    continue;
                }

                $score = $lowestDistance;
            } else {
                // Negate the similarity so ascending order puts the best match first.
                $score = -1 * abs(max($measures));
            }

            if ($this->_useLCS) {
                $longestSubstring = max($substrings);

                if ($filterByLCS && $longestSubstring < $minLCS) {
                    continue;
                }

                $score -= $longestSubstring;
            }

            $scores[$pageKey] = $score;
        }

        // Sort by score, preserving record keys
        asort($scores);

        return $scores;
    }

    /**
     * Get Longest Common Substring
     *
     * Case-sensitive; compares whole characters, so multibyte text is safe.
     *
     * @param string $source Term to search for
     * @param string $target Target term to search against
     *
     * @return int Length (in characters) of the longest common substring
     */
    public function getLCS($source, $target)
    {
        $sourceChars = $this->_splitChars($source);
        $targetChars = $this->_splitChars($target);
        $n = count($sourceChars);
        $m = count($targetChars);

        if ($n === 0 || $m === 0) {
            return 0;
        }

        $longest = 0;
        // $previous[$j] = length of the common suffix ending at source[i-2], target[j-1]
        $previous = array_fill(0, $m + 1, 0);

        for ($i = 1; $i <= $n; $i++) {
            $current = [0];

            for ($j = 1; $j <= $m; $j++) {
                if ($sourceChars[$i - 1] === $targetChars[$j - 1]) {
                    $current[$j] = $previous[$j - 1] + 1;
                    $longest = max($longest, $current[$j]);
                } else {
                    $current[$j] = 0;
                }
            }

            $previous = $current;
        }

        return $longest;
    }

    /**
     * Get Levenshtein Distance
     *
     * Case-sensitive; counts edits in whole characters, so multibyte text is safe.
     *
     * @param string $source Term to search for
     * @param string $target Target term to search against
     *
     * @return int Levenshtein Distance
     */
    public function getLevenshtein($source, $target)
    {
        $sourceChars = $this->_splitChars($source);
        $targetChars = $this->_splitChars($target);
        $n = count($sourceChars);
        $m = count($targetChars);

        if ($n === 0) {
            return $m;
        }

        if ($m === 0) {
            return $n;
        }

        // Row for the empty source prefix: turning '' into target[0..j) costs j insertions.
        $previous = range(0, $m);

        for ($i = 1; $i <= $n; $i++) {
            // Turning source[0..i) into '' costs i deletions.
            $current = [$i];

            for ($j = 1; $j <= $m; $j++) {
                $cost = ($sourceChars[$i - 1] === $targetChars[$j - 1]) ? 0 : 1;

                $current[$j] = min(
                    $previous[$j] + 1,        // deletion
                    $current[$j - 1] + 1,     // insertion
                    $previous[$j - 1] + $cost // substitution (or match)
                );
            }

            $previous = $current;
        }

        return $previous[$m];
    }

    /**
     * Get Jaro-Winkler Score
     *
     * Case-insensitive: both strings are lower-cased before the Jaro score and
     * the Winkler prefix bonus are computed.
     *
     * @param string $first  String to match
     * @param string $second String to match
     *
     * @return double Jaro-Winkler score between 0.0 and 1.0, rounded to two decimals
     */
    public function getJaroWinkler($first, $second)
    {
        $firstChars = $this->_splitChars(mb_strtolower($first, CHARSET));
        $secondChars = $this->_splitChars(mb_strtolower($second, CHARSET));

        if (count($firstChars) > count($secondChars)) {
            $longer = $firstChars;
            $shorter = $secondChars;
        } else {
            $longer = $secondChars;
            $shorter = $firstChars;
        }

        // Matching window: half the length of the shorter string, plus one
        $halfLen = intval((count($shorter) / 2) + 1);

        $match1 = $this->_getCharMatch($shorter, $longer, $halfLen);
        $match2 = $this->_getCharMatch($longer, $shorter, $halfLen);
        $matchCount = count($match1);

        // No matches (this also covers empty input), or the two passes disagree
        if ($matchCount === 0 || $matchCount !== count($match2)) {
            return 0.0;
        }

        $trans = $this->_getTranspositions($match1, $match2);

        $distance = (
            $matchCount / count($shorter)
            + $matchCount / count($longer)
            + ($matchCount - $trans) / $matchCount
        ) / 3.0;

        // Apply Winkler Adjustment: bonus for a shared prefix of up to 4 characters
        $prefixLen = $this->_getPrefixLength($firstChars, $secondChars, 4);

        return round(($distance + (0.1 * $prefixLen * (1.0 - $distance))) * 100.0) / 100.0;
    }

    /**
     * Split a string into an array of characters using the CHARSET encoding.
     *
     * Uses mb_str_split() where available and falls back to mb_substr()
     * on older PHP versions.
     *
     * @param string $string String to split
     *
     * @return array List of single-character strings (empty for an empty string)
     */
    private function _splitChars($string)
    {
        $string = strval($string);

        if ($string === '') {
            return [];
        }

        if (function_exists('mb_str_split')) {
            $chars = mb_str_split($string, 1, CHARSET);

            if (is_array($chars)) {
                return $chars;
            }
        }

        $chars = [];
        $length = mb_strlen($string, CHARSET);

        for ($i = 0; $i < $length; $i++) {
            $chars[] = mb_substr($string, $i, 1, CHARSET);
        }

        return $chars;
    }

    /**
     * Get Character Matches
     *
     * For each character of $first, looks for an unused equal character in
     * $second at positions [i - limit, i + limit). Each position of $second
     * can be matched at most once.
     *
     * @param array $first  Characters to look up
     * @param array $second Characters to look in
     * @param int   $limit  Half-width of the matching window
     *
     * @return array Matched characters, in the order they appear in $first
     */
    private function _getCharMatch(array $first, array $second, $limit)
    {
        $common = [];
        $secondLen = count($second);

        foreach ($first as $i => $char) {
            $end = min($i + $limit, $secondLen);

            for ($j = max(0, $i - $limit); $j < $end; $j++) {
                if ($second[$j] === $char) {
                    $common[] = $char;
                    // Mark as consumed; null can never equal a character,
                    // so no input character can collide with the marker.
                    $second[$j] = null;
                    break;
                }
            }
        }

        return $common;
    }

    /**
     * Get Transpositions
     *
     * @param array $first  Matched characters of the first string
     * @param array $second Matched characters of the second string (same length)
     *
     * @return int Half the number of positions at which the two lists differ, rounded down
     */
    private function _getTranspositions(array $first, array $second)
    {
        $mismatches = 0;

        foreach ($first as $i => $char) {
            if ($char !== $second[$i]) {
                $mismatches++;
            }
        }

        return (int) floor($mismatches / 2);
    }

    /**
     * Get Prefix Length
     *
     * @param array $first  Characters of the first string
     * @param array $second Characters of the second string
     * @param int   $max    Upper bound for the returned length
     *
     * @return int Length of the common prefix, capped at $max
     */
    private function _getPrefixLength(array $first, array $second, $max)
    {
        $limit = min(count($first), count($second), $max);
        $length = 0;

        while ($length < $limit && $first[$length] === $second[$length]) {
            $length++;
        }

        return $length;
    }

    /**
     * Print Matrix
     * Utility / Testing function for testing purposes
     *
     * Prints each cell as "[value]", one matrix row per line. The width is
     * taken from the first row; missing cells are printed as "[ ]".
     *
     * @param array $arr 2-dimensional array representing a matrix
     *
     * @return void
     */
    private function _printMatrix($arr)
    {
        if (!isset($arr[0])) {
            return;
        }

        $width = count($arr[0]);
        $height = count($arr);
        $str = '';

        for ($i = 0; $i < $height; $i++) {
            for ($j = 0; $j < $width; $j++) {
                $cell = isset($arr[$i][$j]) ? $arr[$i][$j] : ' ';
                $str .= "[{$cell}]";
            }

            if ($width > 0) {
                $str .= PHP_EOL;
            }
        }

        print($str);
    }
}