设为首页 收藏本站
查看: 798|回复: 0

[经验分享] 提取Python stringlib中的"BMHBNFS"字符串查找算法

[复制链接]

尚未签到

发表于 2017-5-6 12:13:30 | 显示全部楼层 |阅读模式
  
Python中的stringlib字符串查找算法是
Boyer-Moore,Horspool, Sunday, Bloom Filter几种算法的合成体, 大概的原理如下:

def find(s, p):# find first occurrence of p in sn = len(s)m = len(p)skip = delta1(p)[p[m-1]]i = 0while i <= n-m:if s[i+m-1] == p[m-1]: # (boyer-moore)# potential matchif s[i:i+m-1] == p[:m-1]:return iif s[i+m] not in p:i = i + m + 1 # (sunday)else:i = i + skip # (horspool)else:# skipif s[i+m] not in p:i = i + m + 1 # (sunday)else:i = i + 1return -1 # not found
以下是具体实现:

/* stringlib: fastsearch implementation */#ifndef STRINGLIB_FASTSEARCH_H#define STRINGLIB_FASTSEARCH_H#include <stdint.h>#include <unistd.h>#include <stdio.h>#include <stdlib.h>#include <string.h>/* fast search/count implementation, based on a mix between boyer-moore and horspool, with a few more bells and whistles on the top.for some more background, see: http://effbot.org/zone/stringlib.htm *//* note: fastsearch may access s[n], which isn't a problem when usingPython's ordinary string types, but may cause problems if you'reusing this code in other contexts.  also, the count mode returns -1if there cannot possible be a match in the target string, and 0 ifit has actually checked for matches, but didn't find any.  callersbeware! */#define FAST_COUNT 0#define FAST_SEARCH 1#define FAST_RSEARCH 2#ifndef LONG_BIT#define LONG_BIT 32#endif#if LONG_BIT >= 128#define STRINGLIB_BLOOM_WIDTH 128#elif LONG_BIT >= 64#define STRINGLIB_BLOOM_WIDTH 64#elif LONG_BIT >= 32#define STRINGLIB_BLOOM_WIDTH 32#else#error "LONG_BIT is smaller than 32"#endif#define STRINGLIB_BLOOM_ADD(mask, ch) \((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))#define STRINGLIB_BLOOM(mask, ch)     \((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))ssize_t fastsearch(const char *s, ssize_t n,const char *p, ssize_t m,ssize_t maxcount, int mode){unsigned long mask;ssize_t skip, count = 0;ssize_t i, j, mlast, w;w = n - m;if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) {return -1;}/* look for special cases */if (m <= 1) {if (m <= 0) {return -1;}/* use special case for 1-character strings */if (mode == FAST_COUNT) {for (i = 0; i < n; i++)if (s == p[0]) {count++;if (count == maxcount) {return maxcount;}}return count;}else if (mode == FAST_SEARCH) {for (i = 0; i < n; i++)if (s == p[0]) {return i;}}else {      /* FAST_RSEARCH */for (i = n - 1; i > -1; i--)if (s == p[0]) {return i;}}return -1;}mlast = m - 1;skip = mlast - 1;mask = 0;if (mode != FAST_RSEARCH) {/* create compressed boyer-moore delta 1 table *//* process pattern[:-1] */for (i = 0; i < mlast; i++) {STRINGLIB_BLOOM_ADD(mask, p);if (p == p[mlast]) {skip = mlast - i - 1;}}/* process pattern[-1] outside the loop */STRINGLIB_BLOOM_ADD(mask, p[mlast]);for (i = 0; i <= w; i++) {/* note: using mlast in the skip path slows things down on x86 */if (s[i + m - 1] == p[m - 1]) {/* candidate match */for (j = 0; j < mlast; j++)if (s[i + j] != p[j]) {break;}if (j == mlast) {/* got a match! */if (mode != FAST_COUNT) {return i;}count++;if (count == maxcount) {return maxcount;}i = i + mlast;continue;}/* miss: check if next character is part of pattern */if (!STRINGLIB_BLOOM(mask, s[i + m])) {i = i + m;}else {i = i + skip;}}else {/* skip: check if next character is part of pattern */if (!STRINGLIB_BLOOM(mask, s[i + m])) {i = i + m;}}}}else {      /* FAST_RSEARCH *//* create compressed boyer-moore delta 1 table *//* process pattern[0] outside the loop */STRINGLIB_BLOOM_ADD(mask, p[0]);/* process pattern[:0:-1] */for (i = mlast; i > 0; i--) {STRINGLIB_BLOOM_ADD(mask, p);if (p == p[0]) {skip = i - 1;}}for (i = w; i >= 0; i--) {if (s == p[0]) {/* candidate match */for (j = mlast; j > 0; j--)if (s[i + j] != p[j]) {break;}if (j == 0)/* got a match! */{return i;}/* miss: check if previous character is part of pattern */if (!STRINGLIB_BLOOM(mask, s[i - 1])) {i = i - m;}else {i = i - skip;}}else {/* skip: check if previous character is part of pattern */if (!STRINGLIB_BLOOM(mask, s[i - 1])) {i = i - m;}}}}if (mode != FAST_COUNT) {return -1;}return count;}#endif  

  测试代码
#include <arch/cycle.h>int main(int argc, char **argv){char *str = "GET / HTTP 1.0\r\nHost: www.xxx.com\r\nCache: \r\nCache:\r\n Length:\r\n";ssize_t rc = 0;uint64_t start, end;start = get_cycle_count();rc = fastsearch(str, strlen(str), "Cache:", 6, 2, FAST_SEARCH);end = get_cycle_count();printf("fastsearch return %u cost %llu \n", rc, end - start);printf("result = %s\n", str + rc);rc = fastsearch(str, strlen(str), "Cache:", 6, -1, FAST_COUNT);printf("result = %u\n", rc);return 0;}
看stringlib测试数据, 还是蛮可以的.
  我在tile平台上测试发现还没有snort中的BMH算法速度快.
  不过这个只是单一测试, 没有考虑到cache的情况, 仅供参考.
  

  原文参考:
The stringlibLibrary有详细的描叙.

运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦870801961 群⑧679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.yunweiku.com/thread-373797-1-1.html 上篇帖子: Python常用日期函数日期增减日期格式化 下篇帖子: python htmllib.HTMLParser处理A标签获取链接和描述
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表