[转]一个用php写的中文分词类
一个用php写的中文分词类
[*]<?php
/**
 * Dictionary-based word segmenter for text in a double-byte encoding.
 *
 * Input is processed byte-wise: any byte >= 129 is taken as the first byte of
 * a two-byte character and is paired with the byte that follows it; every
 * other byte belongs to a single-byte (ASCII) run. This matches GBK/GB2312
 * style encodings. NOTE(review): UTF-8 input (three bytes per Chinese
 * character) would be paired incorrectly - convert it before segmenting.
 *
 * Segmentation scans each line from right to left and, at every position,
 * takes the longest dictionary word of 2..MAX_WORD_CHARS characters that ends
 * there; when nothing matches, the single character becomes its own token.
 * Every emitted token is preceded by one space.
 */
class Segmentation
{
    /** Smallest byte value treated as the first byte of a two-byte character. */
    const LEAD_BYTE_MIN = 129;

    /** Longest dictionary word, counted in two-byte characters, that is tried. */
    const MAX_WORD_CHARS = 4;

    /**
     * Behaviour switches.
     *  - lowercase:       lower-case single-byte runs.
     *  - segment_english: put spaces around punctuation inside single-byte runs.
     */
    public $options = array(
        'lowercase' => true,
        'segment_english' => false,
    );

    /** Name read from the dictionary header, or 'Unknown'. */
    public $dict_name = 'Unknown';

    /** Loaded words, stored as array keys (word => 1) for constant-time lookup. */
    public $dict_words = array();

    /**
     * Enable or disable lower-casing of single-byte runs.
     *
     * @param mixed $value Interpreted by truthiness.
     * @return bool Always true.
     */
    public function setLowercase($value)
    {
        $this->options['lowercase'] = (bool) $value;

        return true;
    }

    /**
     * Enable or disable punctuation splitting inside single-byte runs.
     *
     * @param mixed $value Interpreted by truthiness.
     * @return bool Always true.
     */
    public function setSegmentEnglish($value)
    {
        $this->options['segment_english'] = (bool) $value;

        return true;
    }

    /**
     * Load a dictionary file and merge its words into $dict_words.
     *
     * The first line is a header: a type tag, optionally followed by a TAB and
     * a dictionary name. Only the type "DICT_WORD_W" is accepted. Each further
     * line holds one word.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool True on success; false when the file is missing, cannot be
     *              opened, is empty, or has a different type tag.
     */
    public function load($dict_file)
    {
        if (!file_exists($dict_file)) {
            return false;
        }

        $fp = fopen($dict_file, 'r');
        if ($fp === false) {
            return false;
        }

        $header = fgets($fp, 1024);
        if ($header === false) {
            fclose($fp);

            return false;
        }

        $fields = explode("\t", trim($header));
        $dict_type = $fields[0];
        // The name is recorded before the type check, so a rejected file
        // still updates $dict_name (kept for backward compatibility).
        $this->dict_name = isset($fields[1]) ? $fields[1] : 'Unknown';

        if ($dict_type !== 'DICT_WORD_W') {
            fclose($fp);

            return false;
        }

        // Read whole lines: a fixed small read length would cut long entries
        // apart, possibly in the middle of a two-byte character. Testing the
        // fgets() result (rather than feof()) avoids storing an empty key
        // once the end of the file is reached.
        while (($line = fgets($fp)) !== false) {
            $word = rtrim($line);
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }

        fclose($fp);

        return true;
    }

    /**
     * @return string Name of the most recently read dictionary header.
     */
    public function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segment a string, treating "\n" as the line separator.
     *
     * @param string $str Text to segment.
     * @return string|false Segmented text (one "\n"-terminated line per input
     *                      line), or false when no dictionary is loaded.
     */
    public function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segment the contents of a file.
     *
     * @param string $filename File to read.
     * @return string|false Segmented text, or false when no dictionary is
     *                      loaded or the file cannot be read.
     */
    public function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        $lines = file($filename);
        if ($lines === false) {
            return false;
        }

        return $this->_segmentLines($lines);
    }

    /**
     * Segment every line and join the results.
     *
     * @param array $lines Lines of text; trailing whitespace is stripped from each.
     * @return string Segmented lines, each followed by "\n", with runs of
     *                spaces collapsed to a single space.
     */
    public function _segmentLines($lines)
    {
        $segmented = '';
        foreach ($lines as $line) {
            $segmented .= $this->_segmentLine(rtrim($line)) . "\n";
        }

        // Token separators and punctuation padding can pile up; squeeze them.
        while (strpos($segmented, '  ') !== false) {
            $segmented = str_replace('  ', ' ', $segmented);
        }

        return $segmented;
    }

    /**
     * Segment a single line.
     *
     * @param string $str One line of text without its line terminator.
     * @return string The tokens of the line, each preceded by one space
     *                (empty string for empty input).
     */
    public function _segmentLine($str)
    {
        $tokens = $this->splitIntoTokens($str);
        $result = '';
        $pos = count($tokens);

        // Walk backwards so the longest word ending at $pos wins.
        while ($pos > 0) {
            $token = $tokens[$pos - 1];

            if (is_array($token)) {
                $result = ' ' . $this->formatSingleByteRun($token[0]) . $result;
                $pos--;
                continue;
            }

            // $candidates[$len] is the string formed by the last $len
            // two-byte characters ending at $pos.
            $candidates = array(1 => $token);
            $limit = min($pos, self::MAX_WORD_CHARS);
            for ($len = 2; $len <= $limit; $len++) {
                $previous = $tokens[$pos - $len];
                if (is_array($previous)) {
                    // A word cannot extend across a single-byte run.
                    break;
                }
                $candidates[$len] = $previous . $candidates[$len - 1];
            }

            $matched = 0;
            for ($len = count($candidates); $len > 1; $len--) {
                if (array_key_exists($candidates[$len], $this->dict_words)) {
                    $matched = $len;
                    break;
                }
            }

            if ($matched > 0) {
                $result = ' ' . $candidates[$matched] . $result;
                $pos -= $matched;
            } else {
                $result = ' ' . $token . $result;
                $pos--;
            }
        }

        return $result;
    }

    /**
     * Split a line into two-byte characters and single-byte runs.
     *
     * @param string $str Line to split.
     * @return array List whose items are either a 2-byte string (one
     *               double-byte character) or a one-element array holding a
     *               run of single-byte characters.
     */
    private function splitIntoTokens($str)
    {
        $str = (string) $str;
        $length = strlen($str);
        $tokens = array();
        $i = 0;

        while ($i < $length) {
            if (ord($str[$i]) >= self::LEAD_BYTE_MIN) {
                // A lead byte that ends the line is padded with a space so
                // the token is still two bytes long.
                $second = ($i + 1 < $length) ? $str[$i + 1] : ' ';
                $tokens[] = $str[$i] . $second;
                $i += 2;
                continue;
            }

            $start = $i;
            while ($i < $length && ord($str[$i]) < self::LEAD_BYTE_MIN) {
                $i++;
            }
            $tokens[] = array(substr($str, $start, $i - $start));
        }

        return $tokens;
    }

    /**
     * Apply the 'segment_english' and 'lowercase' options to a single-byte run.
     *
     * @param string $text Run of single-byte characters.
     * @return string The processed run.
     */
    private function formatSingleByteRun($text)
    {
        if ($this->options['segment_english']) {
            // ASCII punctuation plus TAB and form feed.
            $punct = '[!"#$%&\'()*+,\-.\/:;<=>?@\[\\\\\]^_`{|}~\t\f]';
            // Surround each punctuation run with spaces ...
            $text = preg_replace('/(' . $punct . '+)/', ' $1 ', $text);
            // ... then separate neighbouring punctuation characters. The
            // lookahead makes this work for runs of any length.
            $text = preg_replace('/(' . $punct . ')(?=' . $punct . ')/', '$1 ', $text);
        }

        if ($this->options['lowercase']) {
            $text = strtolower($text);
        }

        return $text;
    }
}
[*]?>
来源参考:
http://www.phpchina.cn/code/2006/0607/381.html
http://www.xuchao.cn/?play=reply&id=851
页:
[1]