worker321 发表于 2018-12-23 14:52:15

PHP中文分词

  PHP中文分词
  最常见的词语二分法:
  // Sample input: Chinese text mixed with an ASCII URL and punctuation.
  $str = '这是我的网站www.7di.net!';
  // spStr() is the UTF-8 version of the tokenizer; enable the line below to
  // convert GB2312-encoded input to UTF-8 first.
  //$str = iconv('GB2312','UTF-8',$str);
  $result = spStr($str);
  // Dump the token list returned by spStr().
  print_r($result);
  /**
   * Bigram (two-character) tokenizer for UTF-8 Chinese text.
   *
   * The input is cut at punctuation and whitespace. Runs of ASCII digits and
   * runs of ASCII letters are extracted as whole tokens. Every remaining
   * segment is turned into overlapping two-character tokens ("ABCD" gives
   * "AB", "BC", "CD"); a segment consisting of a single character is
   * returned as that character.
   *
   * @param string $str UTF-8 encoded text.
   * @return array Flat list of string tokens: digit runs first, then ASCII
   *               letter runs, then the bigrams, each group in input order.
   */
  function spStr($str)
  {
      // Characters treated as token separators. Full-width forms are written
      // as UTF-8 byte escapes so they cannot be confused with (or silently
      // normalised to) their ASCII look-alikes.
      $search = array(
          // ASCII punctuation and whitespace
          ",", "/", "\\", ".", ";", ":", "\"", "!", "~", "`", "^", "(", ")", "?", "-",
          "\t", "\n", "'", "\r", "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]",
          // Full-width colon, parentheses, full stop, comma, exclamation mark, semicolon
          "\xef\xbc\x9a", "\xef\xbc\x89", "\xef\xbc\x88", "\xef\xbc\x8e",
          "\xef\xbc\x8c", "\xef\xbc\x81", "\xef\xbc\x9b",
          // Full-width square brackets, ideographic space, full-width hyphen
          "\xef\xbc\xbb", "\xef\xbc\xbd", "\xe3\x80\x80", "\xef\xbc\x8d",
          // CJK punctuation
          "。", "“", "”", "‘", "’", "、", "—", "《", "》", "…", "【", "】",
      );
      $str = str_replace($search, ' ', (string) $str);

      // ASCII words and numbers are kept whole, then removed from the text so
      // that only non-ASCII segments remain for bigram splitting.
      preg_match_all('/[a-zA-Z]+/', $str, $wordMatches);
      preg_match_all('/[0-9]+/', $str, $numberMatches);
      $str = (string) preg_replace('/[0-9a-zA-Z]+/', ' ', $str);

      $bigrams = array();
      foreach (preg_split('/ +/', $str, -1, PREG_SPLIT_NO_EMPTY) as $segment) {
          // Split on character boundaries instead of assuming three bytes per
          // character, so 2- and 4-byte characters cannot be cut in half.
          $chars = preg_split('//u', $segment, -1, PREG_SPLIT_NO_EMPTY);
          if ($chars === false) {
              // The segment is not valid UTF-8; no meaningful bigram exists.
              continue;
          }

          $count = count($chars);
          if ($count === 1) {
              // A lone character has no neighbour; keep it as its own token.
              $bigrams[] = $chars[0];
              continue;
          }
          for ($i = 0; $i + 1 < $count; $i++) {
              $bigrams[] = $chars[$i] . $chars[$i + 1];
          }
      }

      // preg_match_all() stores the full-pattern matches under index 0.
      return array_merge($numberMatches[0], $wordMatches[0], $bigrams);
  }
  执行结果是:
  Array ( [0] => 7 [1] => www [2] => di [3] => net [4] => 这是 [5] => 是我 [6] => 我的 [7] => 的网 [8] => 网站 )
  接下来,将以上结果转换为区位码,PHP代码是:
  // Convert every token in $result to its GB2312 zone code and print the
  // codes separated by single spaces.
  $codes = array(); // start from an empty list so an empty $result still implodes cleanly
  foreach ($result as $s) {
      // gbCode() reads GB2312 byte pairs, so re-encode the UTF-8 token first.
      $gb = iconv('UTF-8', 'GB2312', $s);
      if ($gb === false) {
          // The token contains a character GB2312 cannot represent, so it
          // has no zone code; leave it out instead of emitting a blank entry.
          continue;
      }
      $codes[] = gbCode($gb);
  }
  $code = implode(" ", $codes);
  echo $code;
  /**
   * Convert a GB2312-encoded string to its zone code (qu-wei code).
   *
   * Each character is a pair of bytes; subtracting 160 (0xA0) from a byte
   * gives its zone or position number, and each number is written as two
   * decimal digits, so one character becomes four digits.
   *
   * @param string $str GB2312 bytes, or any other string.
   * @return string The concatenated four-digit codes, or $str unchanged when
   *                it is not made up entirely of byte pairs in 0xA1-0xFE
   *                (for example ASCII words and numbers).
   */
  function gbCode($str) {
      // Require whole byte pairs inside the 94x94 GB2312 grid: an odd length
      // or a byte below 0xA1 cannot be turned into a valid zone code.
      if (!preg_match('/^(?:[\xa1-\xfe]{2})+$/D', $str)) return $str;

      $return = '';
      $len = strlen($str);
      for ($i = 0; $i < $len; $i += 2) {
          $return .= sprintf("%02d%02d", ord($str[$i]) - 160, ord($str[$i + 1]) - 160);
      }
      return $return;
  }

页: [1]
查看完整版本: PHP中文分词