diff --git a/src/Analysis/Analysis.php b/src/Analysis/Analysis.php index 72fd2bc..ae996a3 100644 --- a/src/Analysis/Analysis.php +++ b/src/Analysis/Analysis.php @@ -4,6 +4,6 @@ class Analysis implements AnalysisInterface { - - + // TODO 分词算法做一个单独的实例 + public function analyze(){} } diff --git a/src/Analysis/AnalysisInterface.php b/src/Analysis/AnalysisInterface.php index 64fb7e3..27b8ed2 100644 --- a/src/Analysis/AnalysisInterface.php +++ b/src/Analysis/AnalysisInterface.php @@ -4,6 +4,5 @@ interface AnalysisInterface { - - + public function analyze(); } diff --git a/src/Analysis/ChineseAnalysis.php b/src/Analysis/ChineseAnalysis.php index b576e71..721b0a5 100644 --- a/src/Analysis/ChineseAnalysis.php +++ b/src/Analysis/ChineseAnalysis.php @@ -17,8 +17,7 @@ class ChineseAnalysis implements ChineseAnalysisInterface */ public function __construct($source = '') { -// $this->addonDicFile = dirname(__FILE__) . '/' . $this->addonDicFile; -// $this->mainDicFile = dirname(__FILE__) . '/' . $this->mainDicFile; + $this->differMax = false; $this->unitWord = false; @@ -35,7 +34,6 @@ public function __construct($source = '') $this->addonDic = $additionDict; $this->loadTime = $loadTime; - $this->analysis = new Analysis(); } @@ -51,12 +49,14 @@ function __destruct() /** * 从文件获得词 + * * @param $key * @param $type (类型 word 或 key_groups) * @return short int */ - public function GetWordInfos($key, $type = 'word') + public function getWordInfos($key, $type = 'word') { + // TODO 简化算法 if (!$this->mainDicHand) { $this->mainDicHand = fopen($this->mainDicFile, 'r'); } @@ -94,6 +94,15 @@ public function GetWordInfos($key, $type = 'word') return ($type == 'word' ? $data[$key] : $data); } + + /** + * @param array $addonDic + */ + public function setAttach($addonDic = []) + { + $this->addonDic = $addonDic; + } + /** * 设置源字符串 * @param $source @@ -137,7 +146,7 @@ public function setSource($source) * * @return void */ - public function SetResultType($rstype) + public function setResultType($rstype) { $this->resultType = $rstype; } @@ -146,9 +155,9 @@ public function SetResultType($rstype) /** * 检测某个词是否存在 */ - public function IsWord($word) + public function isWord($word) { - $winfos = $this->GetWordInfos($word); + $winfos = $this->getWordInfos($word); return ($winfos !== false); } @@ -157,22 +166,22 @@ public function IsWord($word) * @parem $word unicode编码的词 * @return void */ - public function GetWordProperty($word) + public function getWordProperty($word) { if (strlen($word) < 4) { return '/s'; } - $infos = $this->GetWordInfos($word); + $infos = $this->getWordInfos($word); return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s"; } /** * 指定某词的词性信息(通常是新词) - * @parem $word unicode编码的词 - * @parem $infos array('c' => 词频, 'm' => 词性); + * @param $word unicode编码的词 + * @param $infos array('c' => 词频, 'm' => 词性); * @return void; */ - public function SetWordInfos($word, $infos) + public function setWordInfos($word, $infos) { if (strlen($word) < 4) { return; @@ -193,8 +202,6 @@ public function SetWordInfos($word, $infos) */ public function startAnalysis($optimize = true) { -// -// // $this->analysis->analysis( // [ // $this->sourceString, @@ -312,7 +319,7 @@ public function startAnalysis($optimize = true) $this->newWords[$tmpw] = 1; if (!isset($this->newWords[$tmpw])) { $this->foundWordStr .= StringTool::encoding($tmpw, $this->targetCharSet) . '/nb, '; - $this->SetWordInfos($tmpw, ['c' => 1, 'm' => 'nb']); + $this->setWordInfos($tmpw, ['c' => 1, 'm' => 'nb']); } $this->simpleResult[$s]['t'] = 13; @@ -404,14 +411,18 @@ public function startAnalysis($optimize = true) private function _deep_analysis(&$str, $ctype, $spos, $optimize = true) { + $notSplitLen = $this->notSplitLen; + $simpleResult = $this->simpleResult; + $addonDic = $this->addonDic; + //中文句子 if ($ctype == 1) { $slen = strlen($str); //小于系统配置分词要求长度的句子 - if ($slen < $this->notSplitLen) { + if ($slen < $notSplitLen) { $tmpstr = ''; $lastType = 0; - if ($spos > 0) $lastType = $this->simpleResult[$spos - 1]['t']; + if ($spos > 0) $lastType = $simpleResult[$spos - 1]['t']; if ($slen < 5) { //echo iconv(UCS2, 'utf-8', $str).'
'; if ($lastType == 4 && (isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]))) { @@ -425,7 +436,7 @@ private function _deep_analysis(&$str, $ctype, $spos, $optimize = true) $this->simpleResult[$spos - 1]['t'] = 4; if (!isset($this->newWords[$this->simpleResult[$spos - 1]['w']])) { $this->foundWordStr .= StringTool::encoding($ww, $this->targetCharSet) . '/mu, '; - $this->SetWordInfos($ww, ['c' => 1, 'm' => 'mu']); + $this->setWordInfos($ww, ['c' => 1, 'm' => 'mu']); } $this->simpleResult[$spos]['w'] = ''; if ($str2 != '') { @@ -450,11 +461,20 @@ private function _deep_analysis(&$str, $ctype, $spos, $optimize = true) $this->finallyResult[$spos][] = $str; } } + + $this->notSplitLen = $notSplitLen; + $this->simpleResult = $simpleResult; + $this->addonDic = $addonDic; } /** * 中文的深入分词 - * @parem $str + * @param $str + * @param $lastec + * @param $spos + * @param $slen + * @param $optimize + * * @return void */ private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = true) @@ -467,7 +487,7 @@ private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = tru $tmparr[] = $str; if (!isset($this->newWords[$str])) { $this->foundWordStr .= StringTool::encoding($str, $this->targetCharSet) . '/nq, '; - $this->SetWordInfos($str, ['c' => 1, 'm' => 'nq']); + $this->setWordInfos($str, ['c' => 1, 'm' => 'nq']); } if (!$this->differMax) { $this->finallyResult[$spos][] = $str; @@ -493,7 +513,7 @@ private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = tru $i = $i - 1; break; } - if ($this->IsWord($w)) { + if ($this->isWord($w)) { $tmparr[] = $w; $i = $i - $k + 1; $isok = true; @@ -515,10 +535,12 @@ private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = tru /** * 对最终分词结果进行优化(把simpleresult结果合并,并尝试新词识别、数词合并等) - * @parem $optimize 是否优化合并的结果 + * t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 + * + * @param $smarr + * @param $spos 是否优化合并的结果 * @return bool */ - //t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 private function _optimize_result(&$smarr, $spos) { $newarr = []; @@ -534,7 +556,7 @@ private function _optimize_result(&$smarr, $spos) $this->simpleResult[$prePos]['t'] = 4; if (!isset($this->newWords[$this->simpleResult[$prePos]['w']])) { $this->foundWordStr .= StringTool::encoding($this->simpleResult[$prePos]['w'], $this->targetCharSet) . '/mu, '; - $this->SetWordInfos($this->simpleResult[$prePos]['w'], ['c' => 1, 'm' => 'mu']); + $this->setWordInfos($this->simpleResult[$prePos]['w'], ['c' => 1, 'm' => 'mu']); } $smarr[0] = ''; $i++; @@ -565,7 +587,7 @@ private function _optimize_result(&$smarr, $spos) $newarr[$j] = $cw . $nw; if (!isset($this->newWords[$newarr[$j]])) { $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/mu, '; - $this->SetWordInfos($newarr[$j], ['c' => 1, 'm' => 'mu']); + $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'mu']); } $j++; $i++; @@ -575,7 +597,7 @@ private function _optimize_result(&$smarr, $spos) $is_rs = false; //词语是副词或介词或频率很高的词不作为人名 if (strlen($nw) == 4) { - $winfos = $this->GetWordInfos($nw); + $winfos = $this->getWordInfos($nw); if (isset($winfos['m']) && ($winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) { $is_rs = true; } @@ -589,7 +611,7 @@ private function _optimize_result(&$smarr, $spos) $i++; } if (!isset($this->newWords[$newarr[$j]])) { - $this->SetWordInfos($newarr[$j], ['c' => 1, 'm' => 'nr']); + $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'nr']); $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/nr, '; } //为了防止错误,保留合并前的姓名 @@ -613,7 +635,7 @@ private function _optimize_result(&$smarr, $spos) $is_rs = false; //词语是副词或介词不作为前缀 if (strlen($cw) > 2) { - $winfos = $this->GetWordInfos($cw); + $winfos = $this->getWordInfos($cw); if (isset($winfos['m']) && ($winfos['m'] == 'a' || $winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) { $is_rs = true; } @@ -622,7 +644,7 @@ private function _optimize_result(&$smarr, $spos) $newarr[$j] = $cw . $nw; if (!isset($this->newWords[$newarr[$j]])) { $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/na, '; - $this->SetWordInfos($newarr[$j], ['c' => 1, 'm' => 'na']); + $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'na']); } $i++; $j++; @@ -642,7 +664,7 @@ private function _optimize_result(&$smarr, $spos) } if (!isset($this->newWords[$newarr[$j]])) { $this->foundWordStr .= StringTool::encoding($newarr[$j], $this->targetCharSet) . '/ms, '; - $this->SetWordInfos($newarr[$j], ['c' => 1, 'm' => 'ms']); + $this->setWordInfos($newarr[$j], ['c' => 1, 'm' => 'ms']); } $i++; $j++; @@ -660,7 +682,7 @@ private function _optimize_result(&$smarr, $spos) for ($y = 2; $y <= $slen - 2; $y = $y + 2) { $nhead = substr($nw, $y - 2, 2); $nfont = $cw . substr($nw, 0, $y - 2); - if ($this->IsWord($nfont . $nhead)) { + if ($this->isWord($nfont . $nhead)) { if (strlen($cw) > 2) $j++; $hasDiff = true; $newarr[$j] = $nfont . $nhead; @@ -677,6 +699,7 @@ private function _optimize_result(&$smarr, $spos) /** * 获取最终结果字符串(用空格分开后的分词结果) + * * @return string */ public function getFinallyResult($spword = ' ', $word_meanings = false) @@ -688,7 +711,7 @@ public function getFinallyResult($spword = ' ', $word_meanings = false) } $m = ''; if ($word_meanings) { - $m = $this->GetWordProperty($v['w']); + $m = $this->getWordProperty($v['w']); } $w = StringTool::encoding($v['w'], $this->targetCharSet); if ($w != ' ') { @@ -704,24 +727,27 @@ public function getFinallyResult($spword = ' ', $word_meanings = false) /** * 获取粗分结果,不包含粗分属性 + * * @return array() */ - public function GetSimpleResult() + public function getSimpleResult() { - $rearr = []; + $list = []; foreach ($this->simpleResult as $k => $v) { if (empty($v['w'])) continue; $w = StringTool::encoding($v['w'], $this->targetCharSet); - if ($w != ' ') $rearr[] = $w; + if ($w != ' ') $list[] = $w; } - return $rearr; + + return $list; } /** * 获取粗分结果,包含粗分属性(1中文词句、2 ANSI词汇(包括全角),3 ANSI标点符号(包括全角),4数字(包括全角),5 中文标点或无法识别字符) + * * @return array() */ - public function GetSimpleResultAll() + public function getSimpleResultAll() { $rearr = []; foreach ($this->simpleResult as $k => $v) { @@ -731,14 +757,16 @@ public function GetSimpleResultAll() $rearr[$k]['t'] = $v['t']; } } + return $rearr; } /** * 获取最终关键字(返回用 "," 间隔的关键字) + * * @return string */ - public function GetFinallyKeywords($num = 10) + public function getFinallyKeywords($num = 10) { $n = 0; @@ -758,9 +786,10 @@ public function GetFinallyKeywords($num = 10) $arr[$w] = 1; } } + arsort($arr); - $okstr = ''; + $finallyString = ''; foreach ($arr as $k => $v) { //排除长度为1的词 if (strlen($k) == 1) { @@ -773,12 +802,12 @@ public function GetFinallyKeywords($num = 10) elseif (strlen($k) < 4 && !preg_match('/[a-zA-Z]/', $k)) { continue; } - $okstr .= ($okstr == '' ? $k : ',' . $k); + $finallyString .= ($finallyString == '' ? $k : ',' . $k); $n++; if ($n > $num) break; } - return $okstr; + return $finallyString; } } diff --git a/src/Analysis/Loader.php b/src/Analysis/Loader.php index a400e63..aff5bf1 100644 --- a/src/Analysis/Loader.php +++ b/src/Analysis/Loader.php @@ -3,47 +3,47 @@ trait Loader { - - /** * 载入词典 * @param String $mainDic + * * @return array */ public function getLoadDict($mainDic = '') { - //常量定义 - $_SP_ = chr(0xFF) . chr(0xFE); - $UCS2 = 'ucs-2be'; - - $ADDITION_FILE = __DIR__ . '/dict/words_addons.dic'; - $mainDicFile = __DIR__ . '/dict/base_dic_full.dic'; -// $mainDicFile = null; - $mainDicHand = null; - $additionDict = []; + // $addonDicFile = dirname(__FILE__) . '/' . $this->addonDicFile; + // $mainDicFile = dirname(__FILE__) . '/' . $this->mainDicFile; + //常量定义 + $_SP_ = chr(0xFF) . chr(0xFE); + $UCS2 = 'ucs-2be'; + $additionFile = __DIR__ . '/dict/words_addons.dic'; + $mainDicFile = __DIR__ . '/dict/base_dic_full.dic'; + $mainDicHand = null; + $additionDict = []; + $startTime = microtime(true); + // $mainDicFile = null; - $startTime = microtime(true); //正常读取文件 - $dicAddon = $ADDITION_FILE; + $dicAddon = $additionFile; if ($mainDic == '' || !file_exists($mainDic)) { $dicWords = $mainDicFile; } else { - $dicWords = $mainDic; + $dicWords = $mainDic; $mainDicFile = $mainDic; } // 加载主词典(只打开) $mainDicHand = fopen($dicWords, 'r'); -// //加载附加的 分词 -// if (!empty($additionDict)) { -// $mainDicHand = $mainDicHand . $this->getAdditionDict(); -// } + // //加载附加的 分词 + // if (!empty($additionDict)) { + // $mainDicHand = $mainDicHand . $this->getAdditionDict(); + // } - //载入副词典 + // 载入副词典 $hw = ''; $ds = file($dicAddon); @@ -56,9 +56,9 @@ public function getLoadDict($mainDic = '') } else { $spstr = $_SP_; $spstr = iconv($UCS2, 'utf-8', $spstr); - $ws = explode(',', $d); - $wall = iconv('utf-8', $UCS2, join($spstr, $ws)); - $ws = explode($_SP_, $wall); + $ws = explode(',', $d); + $wall = iconv('utf-8', $UCS2, join($spstr, $ws)); + $ws = explode($_SP_, $wall); foreach ($ws as $estr) { $additionDict[$hw][$estr] = strlen($estr); } @@ -66,9 +66,10 @@ public function getLoadDict($mainDic = '') } $loadTime = microtime(true) - $startTime; -// $isLoadDic = true; - return [$mainDicHand,$mainDic, $additionDict, $loadTime]; + // $isLoadDic = true; + + return [$mainDicHand, $mainDic, $additionDict, $loadTime]; } diff --git a/src/Dict/DictGenerator.php b/src/Dict/DictGenerator.php index 0efd619..a281985 100644 --- a/src/Dict/DictGenerator.php +++ b/src/Dict/DictGenerator.php @@ -8,8 +8,12 @@ class DictGenerator { /** * 编译词典 - * @parem $sourcefile utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt> * 注意, 需要PHP开放足够的内存才能完成操作 + * utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt> + * + * @param $source_file + * @param $target_file + * * @return void */ public function MakeDict($source_file, $target_file = '') @@ -57,15 +61,19 @@ public function MakeDict($source_file, $target_file = '') /** * 导出词典的词条 - * @parem $targetfile 保存位置 - * @return void + * 保存位置 + * + * @param $targetFile + * + * @return bool */ - public function ExportDict($targetfile) + public function exportDict($targetFile) { if (!$this->mainDicHand) { $this->mainDicHand = fopen($this->mainDicFile, 'r'); } - $fp = fopen($targetfile, 'w'); + $fp = fopen($targetFile, 'w'); + // for ($i = 0; $i <= $this->mask_value; $i++) { // $move_pos = $i * 8; // fseek($this->mainDicHand, $move_pos, SEEK_SET); @@ -82,12 +90,18 @@ public function ExportDict($targetfile) // fwrite($fp, "{$w},{$v[0]},{$v[1]}\n"); // } // } + fwrite($fp, $this->ExportDictCore($this->mainDicHand)); fclose($fp); return true; } - public function ExportDictCore($source_str) + + /** + * @param $source_str + * @return string + */ + public function exportDictCore($source_str) { $str = ''; for ($i = 0; $i <= $this->mask_value; $i++) { @@ -103,10 +117,11 @@ public function ExportDictCore($source_str) if (!is_array($data)) continue; foreach ($data as $k => $v) { $w = iconv(UCS2, 'utf-8', $k); -// fwrite($fp, "{$w},{$v[0]},{$v[1]}\n"); + // fwrite($fp, "{$w},{$v[0]},{$v[1]}\n"); $str .= "{$w},{$v[0]},{$v[1]}\n"; } } + return $str; } diff --git a/src/Split/Split.php b/src/Split/Split.php index 0a3a5c0..5252b14 100755 --- a/src/Split/Split.php +++ b/src/Split/Split.php @@ -4,19 +4,37 @@ use phpSplit\Analysis\ChineseAnalysis; -class Split +/** + * php Split 主要接口提供 + * + * @package phpSplit\Split + */ +class Split implements SplitInterface { public $pa; public function __construct() { -// $this->loadConfig(); + // $this->loadConfig(); ChineseAnalysis::$loadInit = false; + $this->pa = new ChineseAnalysis('utf-8', 'utf-8', false); } + + /** + * 添加附加词 + * + * @param array $words + * @return void + */ + public function attach(array $words = []) + { + $this->pa->setAttach($words); + } + /** * 开始分词 * @@ -29,10 +47,14 @@ public function start($word = '') $this->pa->startAnalysis(true); $getInfo = true; - $sign = '-'; - $result = $this->pa->getFinallyResult($sign, $getInfo); - - return explode($sign, $result); + $sign = '-'; + $result = $this->pa->getFinallyResult($sign, $getInfo); + $result = explode($sign, $result); + $result = array_filter($result, function ($var) { + return !empty($var); + }); + + return $result; } /** @@ -47,13 +69,18 @@ public function simple($string = '') $this->pa->startAnalysis(true); $getInfo = true; - $sign = '-'; - $result = $this->pa->getFinallyResult($sign, $getInfo); + $sign = '-'; + $result = $this->pa->getFinallyResult($sign, $getInfo); + $result = explode($sign, $result); + $result = array_filter($result, function ($var) { + return !empty($var); + }); + + return array_map(function ($word) { + $word = explode('/', $word); - return array_map(function($word){ - $word = explode('/',$word); return $word[0]; - },explode($sign, $result)); + }, $result); } /** @@ -63,13 +90,12 @@ public function simple($string = '') */ public static function loadConfig() { - $files = [ - __DIR__ . '/Config.php', - ]; + $files = [__DIR__ . '/Config.php',]; foreach ($files as $file) { if (is_file($file)) { require_once($file); + return true; } } diff --git a/src/Split/SplitInterface.php b/src/Split/SplitInterface.php index 00dac53..8b29157 100755 --- a/src/Split/SplitInterface.php +++ b/src/Split/SplitInterface.php @@ -4,6 +4,26 @@ interface SplitInterface { + /** + * 驱动分词 + * + * @return mixed + */ + public function start(); + + /** + * 附加词 例如(康师傅手机) + * + * @return mixed + */ + public function attach(); + + /** + * 简单分词 (只是获得中文) filter sign , + * + * @return mixed + */ + public function simple(); } ?> diff --git a/tests/analysisTest.php b/tests/analysisTest.php index 4af4ee9..c7ca239 100644 --- a/tests/analysisTest.php +++ b/tests/analysisTest.php @@ -10,12 +10,14 @@ class analysisTest extends PHPUnit_Framework_TestCase { + /** + * 基本测试 + */ public function testAnalysis() { echo "analysis...\n"; - $str='lasticSearch(简称ES)由java语言实现,运行环境依赖java。ES 1. - 0/,查看页面信息,是否正常启动.status=200表示正常启动了,还有一些es的版本信息,name为配'; + $str='对于五到十人小型团队来说,什么样的协作开发方式比较合适?Bitbuket + Worktile + WizNote'; ChineseAnalysis::$loadInit = false; $pa = new ChineseAnalysis('utf-8', 'utf-8', false); // $pa->LoadDict(); diff --git a/tests/initTest.php b/tests/initTest.php index 9f33ea6..5807bfd 100755 --- a/tests/initTest.php +++ b/tests/initTest.php @@ -4,28 +4,48 @@ class initTest extends PHPUnit_Framework_TestCase { + /** + * 标准分词测试 + */ public function testIndex() { echo "test...\n"; $split = new Split(); - var_dump( $split->start("您好 phpSplit")); + var_dump( $split->start("您好phpSplit,不管怎么说你开心就好")); $this->assertTrue(True); } + /** + * 简单测试 + */ public function testSimple() { echo "test...\n"; $split = new Split(); - var_dump( $split->simple("您好 phpSplit")); + var_dump( $split->simple("您好phpSplit,不管怎么说你开心就好")); $this->assertTrue(True); } + + /** + * 附加词语测试 + */ + public function testAddonSimple() + { + echo "test attach ... \n"; + + $split = new Split(); + $split->attach(['康师傅手机']); + var_dump( $split->simple("您好phpSplit,你喜欢康师傅手机么?")); + + $this->assertTrue(True); + } }