这是一个可以运行的解决方案,但仍然需要做一些清理。我的大致算法如下:
- 将所有单词拆分成一个列表 w,同时去除多余的空白和标点符号
- 从偏移量 0 开始,将 w 切分成若干长度为 n 的块,得到一个块数组
- 从偏移量 1 开始,将 w 切分成若干长度为 n 的块,得到一个块数组
- ……依此类推,直到得到从偏移量 n-1 开始的长度为 n 的块数组
- 注意:如果 w 的最后一个块长度不足 n,不要将它包含在块数组中
- 将所有块数组连接为 c
- 统计 c 中每个值出现的频率
// Sample input for the phrase-frequency demo below: Latin filler text that
// contains punctuation, mixed letter case, and several repeated words.
$sample = 'Lorem *ipsum* dolor sit amet, consectetur adipiscing elit. Nunc auctor urna sed urna mattis nec interdum magna ullamcorper. Donec ut lorem eros, id rhoncus nisl. Praesent sodales lorem vitae sapien volutpat et accumsan lorem viverra. Proin lectus elit, cursus ut feugiat ut, porta sit amet leo. Cras est nisl, aliquet quis lobortis sit amet, viverra non erat. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Integer euismod scelerisque quam, et aliquet nibh dignissim at. Pellentesque ut elit neque. Etiam facilisis nisl eu mauris luctus in consequat libero volutpat. Pellentesque auctor, justo in suscipit mollis, erat justo sollicitudin ipsum, in cursus erat ipsum id turpis. In tincidunt hendrerit scelerisque.';
/**
 * Collects the word chunks of a word list for every starting offset.
 *
 * For each offset from $offset up to $length - 1, the word list is sliced at
 * that offset and cut into consecutive chunks of $length words. The chunks of
 * all offsets are concatenated in offset order. The trailing chunk of an
 * offset may be shorter than $length; callers that need full-length chunks
 * must filter the result themselves.
 *
 * Declared at file level (not inside buildPhrases()) so that buildPhrases()
 * can be called any number of times without a "Cannot redeclare" fatal error.
 *
 * @param array $wordArray List of words.
 * @param int   $length    Number of words per chunk.
 * @param int   $offset    First offset to process (defaults to 0).
 *
 * @return array List of chunks, each chunk being a list of words.
 */
function buildPhraseChunks($wordArray, $length, $offset = 0)
{
    $chunks = array();

    for ($start = $offset; $start < $length; $start++) {
        $chunks = array_merge(
            $chunks,
            array_chunk(array_slice($wordArray, $start), $length)
        );
    }

    return $chunks;
}

/**
 * Builds every $length-word phrase that can be cut out of a text.
 *
 * Punctuation is removed, the text is split on whitespace, and the resulting
 * words are grouped into chunks of $length words for each starting offset
 * 0 .. $length - 1. Chunks with fewer than $length words are dropped. Each
 * remaining chunk is joined with single spaces, trimmed and lower-cased.
 *
 * @param string $string Text to analyse.
 * @param int    $length Number of words per phrase; values below 1 produce
 *                       an empty result.
 *
 * @return array Lower-cased phrases. Keys are the positions of the chunks in
 *               the unfiltered chunk list, so they are not guaranteed to be
 *               contiguous.
 */
function buildPhrases($string, $length)
{
    // The "u" modifier makes \p{P} operate on code points; without it the
    // pattern is applied byte by byte and can remove bytes from the middle
    // of multibyte UTF-8 letters.
    $onlyWords = preg_replace('/\p{P}/u', '', $string);
    if ($onlyWords === null) {
        // preg_replace() returns null for subjects that are not valid UTF-8;
        // fall back to byte-wise matching instead of losing the whole text.
        $onlyWords = preg_replace('/\p{P}/', '', $string);
    }

    // PREG_SPLIT_NO_EMPTY prevents leading/trailing whitespace from producing
    // empty "words", which would otherwise yield empty or too-short phrases.
    $wordArray = preg_split('/\s+/', $onlyWords, -1, PREG_SPLIT_NO_EMPTY);

    $phrases = array();
    foreach (buildPhraseChunks($wordArray, $length) as $position => $chunk) {
        // Loose comparison on purpose: $length may arrive as a numeric string.
        if (count($chunk) == $length) {
            $phrases[$position] = strtolower(trim(implode(' ', $chunk)));
        }
    }

    return $phrases;
}
// Predicate used to discard phrases that occur exactly once.
$dropOnes = function ($a) {
    return !($a == 1);
};

// Build all one-word phrases from the sample text.
$phrases = buildPhrases($sample, 1);

// Count how often each phrase occurs, then keep only the repeated ones.
$freqCount = array_count_values($phrases);
$freqCount = array_filter($freqCount, $dropOnes);

// Most frequent phrases first (keys are kept), then dump the result.
arsort($freqCount);
print_r($freqCount);