2012. 5. 17. 09:59, 유용한 지식 자료들/기타
Aho-Corasick String Matching in Python
from collections import deque
class State:
    """One node of the keyword trie used for Aho-Corasick matching.

    Attributes:
        sid: id of the state. The state whose id is 0 is special: it
            reports a transition on every input (see testTransition).
        value: the input value that leads into this state.
        tranList: list of child State objects (the outgoing transitions).
        outputSet: set of keywords reported when this state is reached.
        failState: state to fall back to on a mismatch. Initialised to the
            placeholder 0; this class never assigns anything else itself.
    """
    sid = None        ## id of the state
    value = None      ## input value that leads into the state
    tranList = None   ## list of next states for transition
    outputSet = None  ## set of outputs (keywords) at that state
    failState = None  ## fallback state used on a mismatch

    def __init__(self, sid, val):
        self.sid = sid
        self.value = val
        self.tranList = []
        self.failState = 0
        self.outputSet = set()

    def getTransition(self, val):
        """Return the child state reached on input val, or None if absent."""
        for node in self.tranList:
            if node.value == val:
                return node
        return None

    def testTransition(self, val):
        """Return True when this state has a transition on input val.

        The state with sid 0 always answers True, whatever the input is.
        """
        if self.sid == 0:
            return True
        return self.getTransition(val) is not None

    def addOutput(self, key):
        """Merge the set of keywords ``key`` into this state's outputs.

        The merge is a union. The previous symmetric difference (``^``)
        dropped every keyword present on both sides, so merging the same
        set twice silently removed outputs.
        """
        ## Rebind to a fresh set (as before) so the operand is never aliased.
        self.outputSet = self.outputSet | key
##------------------------------------------------------------------------
class ahoCorasick:
    """Multi-keyword matcher built on a trie of State nodes.

    Usage: add keywords with addKeyword, call setFailTransitions once the
    keywords are in, then scan text with findSubstrings.
    """
    root = None      ## start state of the trie (created with id 0)
    newstate = None  ## id handed to the most recently created state

    def __init__(self):
        self.root = State(0, ' ')
        self.newstate = 0

    def addKeyword(self, keywords):
        """Insert every space-separated keyword of ``keywords`` into the trie.

        Each keyword is upper-cased before insertion and recorded in the
        output set of the state where it ends. Empty pieces (produced by
        leading, trailing or repeated spaces, or an empty argument) are
        skipped; previously they registered '' as a keyword on the root.
        """
        for key in keywords.split(' '):
            if not key:
                continue
            key = key.upper()
            current = self.root
            for ch in key:
                child = current.getTransition(ch)
                if child is None:
                    ## No edge for this character yet: grow the trie.
                    self.newstate = self.newstate + 1
                    child = State(self.newstate, ch)
                    current.tranList.append(child)
                current = child
            current.outputSet.add(key)

    def setFailTransitions(self):
        """Compute the fail state of every non-root state, breadth first.

        Children of the root fail to the root. For a deeper state, the fail
        states of its parent are followed until one has a transition on the
        state's own value; the target of that transition (or that state
        itself when getTransition finds nothing) becomes the fail state.
        The outputs of the fail state are then merged in through addOutput.
        """
        queue = deque()
        for nd in self.root.tranList:
            queue.append(nd)
            nd.failState = self.root
        while queue:
            r = queue.popleft()
            for nd in r.tranList:
                queue.append(nd)
                val = nd.value
                current = r.failState
                ## The root accepts every input, so this walk always stops.
                while not current.testTransition(val):
                    current = current.failState
                child = current.getTransition(val)
                if child is None:
                    nd.failState = current
                else:
                    nd.failState = child
                nd.addOutput(nd.failState.outputSet)

    def findSubstrings(self, findStr):
        """Print every keyword occurrence in each space-separated word.

        Each word of ``findStr`` is upper-cased and scanned on its own. For
        every position where the reached state has outputs, the 0-based
        index of that character is printed, followed by one keyword per
        line. setFailTransitions must have been called beforehand, because
        mismatches are resolved through the fail states.
        """
        for string in findStr.split(' '):
            string = string.upper()
            ## Single-argument print: same text on Python 2 and Python 3.
            print("%s %s" % ("Finding substrings in ", string))
            current = self.root
            for j, ch in enumerate(string):
                while not current.testTransition(ch):
                    current = current.failState
                child = current.getTransition(ch)
                ## child is None only at the root, which then stays current.
                if child is not None:
                    current = child
                    if child.outputSet:
                        print(j)
                        for keyw in child.outputSet:
                            print(keyw)

    def displayTree(self):
        """Print the id and value of every non-root state, breadth first."""
        queue = deque(self.root.tranList)
        while queue:
            node = queue.popleft()
            queue.extend(node.tranList)
            print("%s %s" % (node.sid, node.value))

    def displayOutput(self):
        """Print the id and keywords of each non-root state with outputs."""
        queue = deque(self.root.tranList)
        while queue:
            node = queue.popleft()
            queue.extend(node.tranList)
            if node.outputSet:
                print(node.sid)
                for string in node.outputSet:
                    print(string)
if __name__ == "__main__":
    ## Usage: create an ahoCorasick object,
    ## enter keywords with addKeyword("string of keywords"),
    ## then call setFailTransitions (the fail function),
    ## and finally look for keywords with findSubstrings.
    ##
    ## Other sample inputs that were tried:
    ##   addKeyword("he"), addKeyword("she"), addKeyword("his"),
    ##   addKeyword("hers"), addKeyword("ATC"), addKeyword("TC"),
    ##   displayOutput(), findSubstrings("ACGATCTCTCGATC")
    matcher = ahoCorasick()
    matcher.addKeyword("john jane")
    matcher.setFailTransitions()
    matcher.findSubstrings("johnjane")
class State:
    """One node of the keyword trie used for Aho-Corasick matching.

    Attributes:
        sid: id of the state. The state whose id is 0 is special: it
            reports a transition on every input (see testTransition).
        value: the input value that leads into this state.
        tranList: list of child State objects (the outgoing transitions).
        outputSet: set of keywords reported when this state is reached.
        failState: state to fall back to on a mismatch. Initialised to the
            placeholder 0; this class never assigns anything else itself.
    """
    sid = None        ## id of the state
    value = None      ## input value that leads into the state
    tranList = None   ## list of next states for transition
    outputSet = None  ## set of outputs (keywords) at that state
    failState = None  ## fallback state used on a mismatch

    def __init__(self, sid, val):
        self.sid = sid
        self.value = val
        self.tranList = []
        self.failState = 0
        self.outputSet = set()

    def getTransition(self, val):
        """Return the child state reached on input val, or None if absent."""
        for node in self.tranList:
            if node.value == val:
                return node
        return None

    def testTransition(self, val):
        """Return True when this state has a transition on input val.

        The state with sid 0 always answers True, whatever the input is.
        """
        if self.sid == 0:
            return True
        return self.getTransition(val) is not None

    def addOutput(self, key):
        """Merge the set of keywords ``key`` into this state's outputs.

        The merge is a union. The previous symmetric difference (``^``)
        dropped every keyword present on both sides, so merging the same
        set twice silently removed outputs.
        """
        ## Rebind to a fresh set (as before) so the operand is never aliased.
        self.outputSet = self.outputSet | key
##------------------------------------------------------------------------
class ahoCorasick:
    """Multi-keyword matcher built on a trie of State nodes.

    Usage: add keywords with addKeyword, call setFailTransitions once the
    keywords are in, then scan text with findSubstrings.
    """
    root = None      ## start state of the trie (created with id 0)
    newstate = None  ## id handed to the most recently created state

    def __init__(self):
        self.root = State(0, ' ')
        self.newstate = 0

    def addKeyword(self, keywords):
        """Insert every space-separated keyword of ``keywords`` into the trie.

        Each keyword is upper-cased before insertion and recorded in the
        output set of the state where it ends. Empty pieces (produced by
        leading, trailing or repeated spaces, or an empty argument) are
        skipped; previously they registered '' as a keyword on the root.
        """
        for key in keywords.split(' '):
            if not key:
                continue
            key = key.upper()
            current = self.root
            for ch in key:
                child = current.getTransition(ch)
                if child is None:
                    ## No edge for this character yet: grow the trie.
                    self.newstate = self.newstate + 1
                    child = State(self.newstate, ch)
                    current.tranList.append(child)
                current = child
            current.outputSet.add(key)

    def setFailTransitions(self):
        """Compute the fail state of every non-root state, breadth first.

        Children of the root fail to the root. For a deeper state, the fail
        states of its parent are followed until one has a transition on the
        state's own value; the target of that transition (or that state
        itself when getTransition finds nothing) becomes the fail state.
        The outputs of the fail state are then merged in through addOutput.
        """
        queue = deque()
        for nd in self.root.tranList:
            queue.append(nd)
            nd.failState = self.root
        while queue:
            r = queue.popleft()
            for nd in r.tranList:
                queue.append(nd)
                val = nd.value
                current = r.failState
                ## The root accepts every input, so this walk always stops.
                while not current.testTransition(val):
                    current = current.failState
                child = current.getTransition(val)
                if child is None:
                    nd.failState = current
                else:
                    nd.failState = child
                nd.addOutput(nd.failState.outputSet)

    def findSubstrings(self, findStr):
        """Print every keyword occurrence in each space-separated word.

        Each word of ``findStr`` is upper-cased and scanned on its own. For
        every position where the reached state has outputs, the 0-based
        index of that character is printed, followed by one keyword per
        line. setFailTransitions must have been called beforehand, because
        mismatches are resolved through the fail states.
        """
        for string in findStr.split(' '):
            string = string.upper()
            ## Single-argument print: same text on Python 2 and Python 3.
            print("%s %s" % ("Finding substrings in ", string))
            current = self.root
            for j, ch in enumerate(string):
                while not current.testTransition(ch):
                    current = current.failState
                child = current.getTransition(ch)
                ## child is None only at the root, which then stays current.
                if child is not None:
                    current = child
                    if child.outputSet:
                        print(j)
                        for keyw in child.outputSet:
                            print(keyw)

    def displayTree(self):
        """Print the id and value of every non-root state, breadth first."""
        queue = deque(self.root.tranList)
        while queue:
            node = queue.popleft()
            queue.extend(node.tranList)
            print("%s %s" % (node.sid, node.value))

    def displayOutput(self):
        """Print the id and keywords of each non-root state with outputs."""
        queue = deque(self.root.tranList)
        while queue:
            node = queue.popleft()
            queue.extend(node.tranList)
            if node.outputSet:
                print(node.sid)
                for string in node.outputSet:
                    print(string)
if __name__ == "__main__":
    ## Usage: create an ahoCorasick object,
    ## enter keywords with addKeyword("string of keywords"),
    ## then call setFailTransitions (the fail function),
    ## and finally look for keywords with findSubstrings.
    ##
    ## Other sample inputs that were tried:
    ##   addKeyword("he"), addKeyword("she"), addKeyword("his"),
    ##   addKeyword("hers"), addKeyword("ATC"), addKeyword("TC"),
    ##   displayOutput(), findSubstrings("ACGATCTCTCGATC")
    matcher = ahoCorasick()
    matcher.addKeyword("john jane")
    matcher.setFailTransitions()
    matcher.findSubstrings("johnjane")
'유용한 지식 자료들 > 기타' 카테고리의 다른 글
64비트 인지 아닌지 확인하기 (0) | 2012.07.18 |
---|---|
Clustering VS Classification (0) | 2012.06.27 |
리눅스에서 SSD (0) | 2012.05.15 |
How to Compile and Install Linux Kernel 3.1 / 3.0 in Ubuntu 11.10, 11.04, 10.10 and 10.04 (0) | 2012.05.15 |
PPT 활용에 대한 네이버 카페. (0) | 2012.03.14 |
Comments, Trackbacks