defBF(s, p): ''' Brute-Force算法实现字符串匹配 ''' indies = [] n = len(s) m = len(p) for i in range(n - m + 1): index = i for j in range(m): if s[index] == p[j]: index += 1 else: break if index - i == m: indies.append(i) return indies
defgetNextList(s): n = len(s) nextList = [0, 0] j = 0 for i in range(1, n): while j > 0and s[i] != s[j]: j = nextList[j] if s[i] == s[j]: j += 1 nextList.append(j) return nextList
defKMP(s, p): ''' Knuth-Morris-Pratt算法实现字符串匹配 ''' n = len(s) m = len(p) nextList = getNextList(p) indies = [] j = 0 for i in range(n): while s[i] != p[j] and j > 0: j = nextList[j]
if s[i] == p[j]: j += 1 if j == m: indies.append(i-m+1) j = nextList[j] return indies
defaddWord(self, word): ''' 添加关键词到Tire树中 ''' tmp = self.__root for i in range(0, len(word)): if word[i] notin tmp.next: tmp.next[word[i]] = Node() tmp = tmp.next[word[i]] tmp.isWord = True
defmake(self): ''' 构建自动机,失效函数 ''' tmpQueue = [] tmpQueue.append(self.__root) while(len(tmpQueue) > 0): temp = tmpQueue.pop() p = None for k, v in temp.next.items(): if temp == self.__root: temp.next[k].fail = self.__root else: p = temp.fail while p isnotNone: if k in p.next: temp.next[k].fail = p.next[k] break p = p.fail if p isNone : temp.next[k].fail = self.__root tmpQueue.append(temp.next[k])
defsearch(self, content): p = self.__root result = [] startWordIndex = 0 endWordIndex = -1 currentPosition = 0
while currentPosition < len(content): word = content[currentPosition] # 检索状态机,直到匹配 while word notin p.next and p != self.__root: p = p.fail
if word in p.next: if p == self.__root: # 若当前节点是根且存在转移状态,则说明是匹配词的开头,记录词的起始位置 startWordIndex = currentPosition # 转移状态机的状态 p = p.next[word] else: p = self.__root
if p.isWord: # 若状态为词的结尾,则把词放进结果集 result.append((startWordIndex, currentPosition))
currentPosition += 1 return result
defreplace(self, content): replacepos = self.search(content) result = content for i in replacepos: result = result[0:i[0]] + (i[1] - i[0] + 1) * u'*' + content[i[1] + 1:] return result
if __name__ == '__main__': ah = Ahocorasick() text = '中国科学技术大学位于安徽合肥,是一所理工科大学,又称为南七技校。' patterns = '科学 大学 理工科 技校' words = patterns.split(' ') for w in words: ah.addWord(w) ah.make() results = ah.search(text) print(results) if len(results) == 0: print('No find.') else: print(len(results),' matching results are listed below.') print('-------' + '-'*len(text) + '-------') print(text) count = 0 for site in results: w = text[site[0]:site[1]+1] count += 1 print(' '*site[0] + w + ' '*(len(text)-site[1]) + ' ' + str(site[0]) + ' ' + str(count)) print('-------' + '-'*len(text) + '-------')