Techie Techie - 4 months ago 8
Python Question

Count number of occurrences of a given list of strings in a data stream

I have a stream of data something like,

stream = "carracecowtenhihellocohiwcar ......"


and I have to count the number of occurrences of every word in the list within the stream

words = ["car", "cow", "hi", ....]


So, result would be something like

result = {
"car": 2,
"cow": 1,
"hi": 2,
....
....
}


With my current implementation, I iterate through the list of words and add each word's count to a dict, as shown below.

I am looking for a better way to do this, as the list of words keeps growing and the data from the stream is continuous.

This is what I have currently,

import re
def word_count(stream_obj, words=None):
    """Count how often each word occurs in a chunked character stream.

    Chunks are read with ``stream_obj.getchunk()`` until it returns ``None``.
    Every chunk is read exactly once and the counts accumulate across chunks.

    Args:
        stream_obj: object exposing ``getchunk()``, which returns the next
            piece of the stream as a string, or ``None`` when it is finished.
        words: words to look for; defaults to the original hard-coded list.

    Returns:
        A dict mapping every word to its number of occurrences. Occurrences
        are counted per word with ``str.count`` (non-overlapping), so a word
        that can overlap itself (e.g. "aa" in "aaa") may be counted
        differently depending on where the chunk boundaries fall.
    """
    if words is None:
        words = ["car", "cow", "hi", "hello"]

    mydict = dict.fromkeys(words, 0)
    # A word split across two chunks has at most len(word) - 1 characters in
    # the earlier chunk, so that is all that has to be carried forward.
    overlap = max(len(word) for word in words) - 1 if words else 0
    last_chunk_remainder = ""

    while True:
        chunk = stream_obj.getchunk()
        if chunk is None:
            break

        stream_data = last_chunk_remainder + chunk
        carried = len(last_chunk_remainder)
        for word in mydict:
            # Occurrences lying entirely inside the carried-over tail were
            # counted on the previous round; only start where an occurrence
            # would reach into the newly read characters.
            first_new_start = max(0, carried - len(word) + 1)
            mydict[word] += stream_data.count(word, first_new_start)

        # To handle the corner case where the chunk ends with "ca" and the
        # next one starts with "r": keep the tail so "car" is still found.
        last_chunk_remainder = stream_data[max(0, len(stream_data) - overlap):]

    return mydict


Thanks

Answer

I got it working and have tried to cover all known corner cases. I would be really thankful for any suggestions or improvements. Thanks for the help, and sorry for the initially incomplete question.

import re
from collections import defaultdict

# Words whose occurrences are tallied.
WORDS = ["car", "cat", "cow", "hi", "hello"]
# Running tally; a word that has not been seen yet reads as 0.
WORD_COUNTS = defaultdict(int)
# Length of the longest word in WORDS.
MAX_WORD_LEN = max(len(word) for word in WORDS)
# Alternation pattern matching any one of the words, compiled once up front.
REGEX = "|".join(WORDS)
RE_OBJ = re.compile(REGEX)

def count_words(stream):
    """Tally matches of RE_OBJ across a stream delivered in chunks.

    Chunks are pulled from ``stream.get_chunk()`` until it returns ``None``;
    empty chunks are skipped. Every match increments the module-level
    WORD_COUNTS entry for the matched text, so nothing is returned.

    Args:
        stream: object exposing ``get_chunk()``, which returns the next
            string chunk, or ``None`` at the end of the stream.
    """
    last_stream_remainder = ""

    while True:
        data = stream.get_chunk()

        # None marks the end of the stream.
        if data is None:
            break

        if not data:
            continue

        data = last_stream_remainder + data

        # Track the end of the last match for THIS chunk only. Reading the
        # loop variable after the loop would fail when the first chunk has no
        # match and would reuse a stale match from an earlier chunk otherwise.
        last_match_end = 0
        for match in RE_OBJ.finditer(data):
            WORD_COUNTS[match.group(0)] += 1
            last_match_end = match.end()

        # To cover the corner case where the tail of this chunk joins the
        # start of the next one to form a word: carry the unmatched tail
        # forward, capped at MAX_WORD_LEN characters. It is recomputed on
        # every chunk (possibly to "") so that an old remainder is never
        # prepended twice, which would count the same text again.
        keep = min(len(data) - last_match_end, MAX_WORD_LEN)
        last_stream_remainder = data[len(data) - keep:]

class StreamReader(object):
    """Toy stream that hands out a fixed sequence of chunks, one per call.

    Each instance iterates over its own copy of the chunks, so the shared
    STREAM_DATA list is never modified and several readers can be created
    independently.
    """

    # Canned chunks served by default; the trailing None is the end marker.
    STREAM_DATA = ["car1cat1lftrysomecow1shi1iamgoinghello1pleasegoocar2sarehere",
                   "car3car4car5cat2cat3h", "i2thisishello2hello3he", "", "llo4", None]

    def __init__(self, chunks=None):
        """Create a reader over ``chunks`` (defaults to STREAM_DATA)."""
        source = self.STREAM_DATA if chunks is None else chunks
        self._chunks = iter(list(source))

    def get_chunk(self):
        """Return the next chunk, or None once the chunks are used up."""
        return next(self._chunks, None)

stream = StreamReader()
count_words(stream)

# print(...) with a single argument prints the same thing on Python 2 and also
# parses on Python 3; list(...) keeps the output a plain list of pairs on both.
print(list(WORD_COUNTS.items()))
# Expected counts (the order of the pairs may differ):
# [('car', 5), ('hi', 3), ('hello', 4), ('cow', 1), ('cat', 3)]