|
一个用 PHP 编写的中文分词类
- <?php
/**
 * Dictionary-based word segmenter for Chinese text in a double-byte encoding.
 *
 * Any byte with a value >= 129 is treated as the first byte of a two-byte
 * character, and the byte after it as the second. This matches GBK/GB2312
 * style encodings; UTF-8 input is NOT handled correctly by this class.
 *
 * Segmentation is a backward (right-to-left) maximum match: starting from the
 * end of the line, the longest run of 2 to 4 double-byte characters found in
 * the dictionary is emitted as one word; otherwise a single character is
 * emitted. Every emitted token is preceded by one space.
 *
 * All methods report failure by returning FALSE rather than throwing.
 */
class Segmentation
{
    /**
     * Behaviour switches.
     *  - lowercase:       lower-case runs of single-byte (ASCII) text.
     *  - segment_english: put spaces around punctuation inside ASCII runs.
     */
    public $options = array(
        'lowercase'       => true,
        'segment_english' => false,
    );

    /** Name read from the dictionary header, or 'Unknown'. */
    public $dict_name = 'Unknown';

    /** Dictionary words stored as array keys (word => 1) for fast lookup. */
    public $dict_words = array();

    /**
     * Enable or disable lower-casing of single-byte text.
     *
     * @param mixed $value Any truthy value enables the option.
     * @return bool Always TRUE.
     */
    public function setLowercase($value)
    {
        $this->options['lowercase'] = $value ? true : false;
        return true;
    }

    /**
     * Enable or disable splitting of punctuation inside single-byte text.
     *
     * @param mixed $value Any truthy value enables the option.
     * @return bool Always TRUE.
     */
    public function setSegmentEnglish($value)
    {
        $this->options['segment_english'] = $value ? true : false;
        return true;
    }

    /**
     * Load a word dictionary from a text file.
     *
     * The first line is a header: "DICT_WORD_W" optionally followed by a tab
     * and the dictionary name. Every following line holds one word. Words are
     * added to whatever has been loaded before.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool TRUE on success; FALSE if the file is missing, cannot be
     *              opened, is empty, or has an unexpected header type.
     */
    public function load($dict_file)
    {
        if (!file_exists($dict_file)) {
            return false;
        }
        $fp = fopen($dict_file, 'r');
        if ($fp === false) {
            return false;
        }

        $header = fgets($fp, 1024);
        if ($header === false) {
            fclose($fp);
            return false;
        }

        // Header is "TYPE" or "TYPE<TAB>NAME"; a missing name falls back to 'Unknown'.
        $fields    = explode("\t", trim($header));
        $dict_type = $fields[0];
        $dict_name = isset($fields[1]) ? $fields[1] : 'Unknown';

        // The name is recorded even when the type check below rejects the file.
        $this->dict_name = $dict_name;
        if ($dict_type !== 'DICT_WORD_W') {
            fclose($fp);
            return false;
        }

        // Read whole lines so long entries are not split into bogus words,
        // and skip blank lines so the empty string never becomes a "word".
        while (($line = fgets($fp)) !== false) {
            $word = rtrim($line);
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);
        return true;
    }

    /**
     * @return string Name of the most recently read dictionary header.
     */
    public function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segment a string, treating "\n" as the line separator.
     *
     * @param string $str Text to segment.
     * @return string|false Segmented text (one output line per input line),
     *                      or FALSE when no dictionary has been loaded.
     */
    public function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }
        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segment the contents of a file.
     *
     * @param string $filename Path of the file to read.
     * @return string|false Segmented text, or FALSE when no dictionary has
     *                      been loaded or the file cannot be read.
     */
    public function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }
        $lines = file($filename);
        if ($lines === false) {
            return false;
        }
        return $this->_segmentLines($lines);
    }

    /**
     * Segment each line and join the results, one "\n" after every line.
     *
     * @param array $lines Lines of text; trailing whitespace is stripped.
     * @return string Segmented text with runs of spaces collapsed to one.
     */
    public function _segmentLines($lines)
    {
        $contents_segmented = '';
        foreach ($lines as $line) {
            $contents_segmented .= $this->_segmentLine(rtrim($line)) . "\n";
        }

        // Tokens are emitted with a leading space and punctuation splitting
        // adds more, so squeeze repeated spaces until none remain.
        while (strpos($contents_segmented, '  ') !== false) {
            $contents_segmented = str_replace('  ', ' ', $contents_segmented);
        }
        return $contents_segmented;
    }

    /**
     * Segment a single line.
     *
     * @param string $str One line of text without its line terminator.
     * @return string The line with every token preceded by a space.
     */
    public function _segmentLine($str)
    {
        $str_length = strlen($str);

        // If the last byte is >= 129 it may be a dangling lead byte; pad with
        // a space so reading "lead byte + next byte" below stays in bounds.
        if ($str_length > 0 && ord($str[$str_length - 1]) >= 129) {
            $str .= ' ';
        }

        // Pass 1: split into units. A double-byte character becomes a string;
        // a run of single-byte characters becomes a one-element array so the
        // two kinds can be told apart later.
        $units = array();
        for ($i = 0; $i < $str_length; $i++) {
            if (ord($str[$i]) >= 129) {
                $units[] = $str[$i] . $str[$i + 1];
                $i++;
                continue;
            }
            $run = $str[$i];
            for ($j = $i + 1; $j < $str_length && ord($str[$j]) < 129; $j++) {
                $run .= $str[$j];
            }
            $units[] = array($run);
            $i = $j - 1;
        }

        // ASCII punctuation plus tab and form feed, as a PCRE character class.
        $punct = '[!"#$%&\'()*+,\-.\/:;<=>?@\[\\\\\]^_`{|}~\t\f]';

        // Pass 2: walk backwards, emitting the longest dictionary match.
        $str_final = '';
        $pos = count($units);
        while ($pos > 0) {
            $unit = $units[$pos - 1];

            if (is_array($unit)) {
                $chunk = $unit[0];
                if ($this->options['segment_english']) {
                    // Isolate punctuation runs, then separate every adjacent
                    // pair; the lookahead lets matches overlap so "!!!"
                    // becomes "! ! !" rather than "! !!".
                    $chunk = preg_replace('/(' . $punct . '+)/', ' $1 ', $chunk);
                    $chunk = preg_replace('/(' . $punct . ')(?=' . $punct . ')/', '$1 ', $chunk);
                }
                if ($this->options['lowercase']) {
                    $chunk = strtolower($chunk);
                }
                $str_final = ' ' . $chunk . $str_final;
                $pos--;
                continue;
            }

            // Build candidate words of 1..4 characters ending at this unit.
            // Stop at a single-byte run: a word cannot span across it.
            $max_len    = min(4, $pos);
            $candidates = array();
            $candidate  = '';
            for ($len = 1; $len <= $max_len; $len++) {
                $previous = $units[$pos - $len];
                if (is_array($previous)) {
                    break;
                }
                $candidate        = $previous . $candidate;
                $candidates[$len] = $candidate;
            }

            // Longest match first; single characters are never looked up.
            $word_found = 0;
            for ($len = count($candidates); $len > 1; $len--) {
                if (array_key_exists($candidates[$len], $this->dict_words)) {
                    $word_found = $len;
                    break;
                }
            }

            if ($word_found) {
                $str_final = ' ' . $candidates[$word_found] . $str_final;
                $pos -= $word_found;
            } else {
                $str_final = ' ' . $unit . $str_final;
                $pos--;
            }
        }
        return $str_final;
    }
}
- ?>
来源参考:
http://www.phpchina.cn/code/2006/0607/381.html
http://www.xuchao.cn/?play=reply&id=851 |
|
|