CreateMutex
까보면 다나와~

Aho Corasick String Matching in Python

Aho Corasick String Matching in Python

from collections import deque
class State:
    sid = None        ## store the id of state
    value = None      ## stores values of state
    tranList = None    ## used to store the list of next states for transition
    outputSet = None    ## it is set datastructure for storing the outputs at that state
    failState = None

    def __init__(self ,sid, val):
        self.sid = sid
        self.value = val
        self.tranList = []
        self.failState = 0
        self.outputSet = set()

    def getTransition(self, val):
        """ this function gets the next state on input val"""
        for node in self.tranList:
            if node.value == val:
                return node
        return None
   

    def testTransition(self, val):
        """ This checks whether there is transition or not on input val"""
        """ for current state, the transition is always true on any input"""

        if self.sid == 0:    
           return True
        else:
            for nd in self.tranList:
                if nd.value == val:
                    return True
            return False
       
    def addOutput(self, key):
        """This adds the key to the output in the state"""
        self.outputSet = self.outputSet ^ key
       
         
    ##------------------------------------------------------------------------
   

class ahoCorasick:
    root = None
    newstate = None

    def __init__(self):
        self.root = State(0, ' ')
        self.newstate = 0

    def addKeyword(self, keywords):
        """Adds the keyword in the tree"""
   
        for key in keywords.split(' '):
           
            j = 0
            state = 0
            current = self.root
            key = key.upper()

            while j < len(key):
                ch = key[j]
                j = j+ 1
                child = current.getTransition(ch)
                if child != None:
                    current = child
                else:
                    self.newstate = self.newstate +1
                    nd = State(self.newstate, ch)
                    current.tranList.append(nd)
                    current = nd
                    while j < len(key):
                        self.newstate = self.newstate +1
                        nd2 = State(self.newstate, key[j])
                        current.tranList.append(nd2)
                        current = nd2
                        j = j+1
                    break
            current.outputSet.add(key)
       
##-------------------------------------------------------------------
    def setFailTransitions(self):
        """Sets the fail transitions in tree"""
        queue = deque()
        current = self.root
        child = self.root

        for nd in self.root.tranList:
            queue.append(nd)
            nd.failState = self.root

        while len(queue) != 0:
            r = queue.popleft()
            for nd in r.tranList:
                queue.append(nd)
                state = r.failState
                val = nd.value
                current = state
                while True:
                    if current.testTransition(val) == False:
                        current = current.failState
                    else:
                        break
                child = current.getTransition(val)
                if child == None:
                    nd.failState = current
                else:
                    nd.failState = child
            nd.addOutput(nd.failState.outputSet)

##--------------------------------------------------------------------------------------------------
    def findSubstrings(self, findStr):
        """ Finds all substrings of input which are keywords in the tree"""
        for string in findStr.split(' '):
            string = string.upper()
            print "Finding substrings in ", string
            current = self.root
            j = 0
       
            while j < len(string):
                while True:
                    if current.testTransition(string[j]) == False:
                        current = current.failState
                    else:
                        child = current.getTransition(string[j])
##                      print "before break", child.sid
                        break
                if child != None:
##                  print "in none"
                    current = child
                    if len(child.outputSet) != 0:
                        print j
                        itr = iter(child.outputSet)
                        for keyw in itr:
                            print keyw
                j = j + 1   
                   
   
##---------------------------------------------------------
    def displayTree(self):
        """ It is used to display the tree of keywords. Prints ID of node and value of node"""
        queue = deque()
        for nd in self.root.tranList:
            queue.append(nd)

        while len(queue) !=0:
            node = queue.popleft()
            for nd in node.tranList:
                queue.append(nd)
            print node.sid, node.value
           
               
    def displayOutput(self):
        """ This function displays the outputs at a state"""
        queue = deque()
        for nd in self.root.tranList:
            queue.append(nd)

        while len(queue) !=0:
            node = queue.popleft()
            for nd in node.tranList:
                queue.append(nd)
           
            itr = iter(node.outputSet)
            if len(node.outputSet) !=0:
                print node.sid
            for string in itr:
                print string

if (__name__ == "__main__"):
   
    x = ahoCorasick()
    """ Usage: Create object of ahoCorasick
        to enter keywords use addKeyword("string of keywords")
        then call setFailTransitions (fail function)
        to find substrings of string use findSubstrings"""

##    x.addKeyword("he")
##    x.addKeyword("she")
##    x.addKeyword("his")
##    x.addKeyword("hers")
   
##    x.addKeyword("ATC")
##    x.addKeyword("TC")
   
##    x.displayOutput()
   
####    x.enter("help")
####    x.enter("hi")
   
    x.addKeyword("john jane")
  
       
    x.setFailTransitions()
##    x.findSubstrings("ACGATCTCTCGATC")
    x.findSubstrings("johnjane")
  Comments,   0  Trackbacks
댓글 쓰기