dani_anyman - 25 days ago

Increase efficiency of large-file search by readlines(size)

I am new to Python and currently using Python 2. I have several source files, each consisting of a huge amount of data (approx. 19 million lines). They look like the following:

apple \t N \t apple
n&apos
garden \t N \t garden
b\ta\md
great \t Adj \t great
nice \t Adj \t (unknown)
etc


My task is to search the 3rd column of each file for some target words, and every time a target word is found in the corpus, the 10 words before and after it have to be added to a multidimensional dictionary.

EDIT: The lines containing a '&', a '\' or the string '(unknown)' should be excluded.
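
To be concrete, the nested dictionary my code builds maps target lemma → POS → context lemma → context POS → count, for example (the values here are invented for illustration):

# Shape of the result dictionary; example values are invented.
targets = {
    'apple': {              # target lemma (column 3)
        'N': {              # its POS tag (column 2)
            'garden': {     # a lemma seen within 10 words of the target
                'N': 2      # context POS -> co-occurrence count
            }
        }
    }
}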

I tried to solve this using readlines() and enumerate(), as you can see in the code below. The code does what it should, but it is obviously not efficient enough for the amount of data in the source files.

I know that readlines() or read() should not be used for huge data sets, as they load the whole file into memory. Nevertheless, when reading the file line by line, I did not manage to use enumerate() to get the 10 words before and after the target word.
I also cannot use mmap, as I do not have permission to use it on that file.

So, I guess the readlines method with some size limitation would be the most efficient solution. However, wouldn't that introduce errors, because each time the end of a chunk is reached, the 10 words after a target word near that boundary would not be captured, as the read simply stops there?
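
For reference, chunked reading with readlines(size) would look roughly like this sketch (file name and chunk size are just placeholders); the boundary problem described above is that a target near the end of one chunk cannot see into the next chunk:

# Sketch of chunked reading with readlines(sizehint).
# A target in the last 10 lines of a chunk loses its following context,
# because those lines only arrive with the next chunk.
with open('corpus.txt') as f:
    while True:
        chunk = f.readlines(64 * 1024)  # read roughly 64 KB worth of lines
        if not chunk:
            break
        for i, line in enumerate(chunk):
            pass  # process as below; context breaks at the chunk boundary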

import os
import gzip
import re
import csv

def get_target_to_dict(file):
    targets_dict = {}
    with open(file) as f:
        for line in f:
            targets_dict[line.strip()] = {}  # one target lemma per line
    return targets_dict
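
For example, if targets_uniq.txt contained the two lines 'apple' and 'great' (contents invented here), get_target_to_dict would return {'apple': {}, 'great': {}}.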

targets_dict = get_target_to_dict('targets_uniq.txt')
# browse directory and process each file
# find the target words to include the 10 words before and after to the dictionary
# exclude lines starting with <,-,; to just have raw text

def get_co_occurence(path_file_dir, targets, results):
    lines = []
    for file in os.listdir(path_file_dir):
        if file.startswith('corpus'):
            path_file = os.path.join(path_file_dir, file)
            with gzip.open(path_file) as corpusfile:
                # PROBLEMATIC CODE HERE
                # lines = corpusfile.readlines()
                for line in corpusfile:
                    if re.match('[A-Z]|[a-z]', line):
                        if '(unknown)' in line:
                            continue
                        elif '\\' in line:
                            continue
                        elif '&' in line:
                            continue
                        lines.append(line)
                for i, line in enumerate(lines):
                    line = line.strip()
                    if re.match('[A-Z][a-z]', line):
                        parts = line.split('\t')
                        lemma = parts[2]
                        if lemma in targets:
                            pos = parts[1]
                            if pos not in targets[lemma]:
                                targets[lemma][pos] = {}
                            counts = targets[lemma][pos]
                            context = []
                            # look at the 10 previous lines
                            for j in range(max(0, i - 10), i):
                                context.append(lines[j])
                            # look at the next 10 lines
                            for j in range(i + 1, min(i + 11, len(lines))):
                                context.append(lines[j])
                            # END OF PROBLEMATIC CODE
                            for context_line in context:
                                context_line = context_line.strip()
                                parts_context = context_line.split('\t')
                                context_lemma = parts_context[2]
                                if context_lemma not in counts:
                                    counts[context_lemma] = {}
                                context_pos = parts_context[1]
                                if context_pos not in counts[context_lemma]:
                                    counts[context_lemma][context_pos] = 0
                                counts[context_lemma][context_pos] += 1
    csvwriter = csv.writer(results, delimiter='\t')
    for k, v in targets.iteritems():
        for k2, v2 in v.iteritems():
            for k3, v3 in v2.iteritems():
                for k4, v4 in v3.iteritems():
                    csvwriter.writerow([str(k), str(k2), str(k3), str(k4), str(v4)])
                    # print(str(k) + "\t" + str(k2) + "\t" + str(k3) + "\t" + str(k4) + "\t" + str(v4))

results = open('results_corpus.csv', 'wb')
word_occurrence = get_co_occurence(path_file_dir, targets_dict, results)


I copied the whole code for completeness, as it is all part of one function that builds a multidimensional dictionary out of all the extracted information and then writes it to a CSV file.

I would really appreciate any hint or suggestion to make this code more efficient.

EDIT: I corrected the code so that it takes exactly the 10 words before and after the target word into account.

Answer

My idea is to keep one buffer that stores the 10 lines before the current line and another that stores the 10 lines after it. As the file is read, each line is pushed into the before-buffer, and the oldest entry is popped off once the buffer grows past 10 lines.

For the after-buffer, I first clone the file iterator with itertools.tee(). Both iterators then run in parallel inside the loop, with the clone running 10 iterations ahead to supply the next 10 lines.

This avoids readlines() and never loads the whole file into memory. Hope it works for your actual case.
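
To see the mechanics in isolation, here is a minimal self-contained sketch of the two-buffer idea on toy data (the word list and the target are invented for illustration):

import itertools

words = iter(['w%d' % i for i in range(30)])  # toy stand-in for the corpus
words, ahead = itertools.tee(words)           # clone the iterator

before_buf, after_buf = [], []
for w in words:
    before_buf.append(w)
    if len(before_buf) > 11:        # keep the current word + the 10 before it
        before_buf.pop(0)
    while len(after_buf) <= 10:     # let the clone run ahead
        try:
            after_buf.append(next(ahead))
        except StopIteration:
            break
    after_buf and after_buf.pop(0)  # front entry is the current word; drop it
    if w == 'w15':                  # pretend 'w15' is a target word
        print(before_buf[:-1] + after_buf)  # 10 words before and 10 after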

Edited: the before/after buffers are now only filled if column 3 does not contain any of '&', '\', '(unknown)'. I also changed split('\t') into plain split(), so it handles any whitespace as well as tabs.

import itertools
import gzip
import re

excluded_words = ['&', '\\', '(unknown)'] # modify excluded words here
with gzip.open(path_file) as corpusfile:
    # CHANGED CODE HERE
    # lines = corpusfile.readlines()

    before_buf = [] # buffer to store before 10 lines
    after_buf = []  # buffer to store after 10 lines
    corpusfile, corpusfile_clone = itertools.tee(corpusfile) # clone file iterator to access next 10 lines

    # for i, line in enumerate(corpusfile):
    for line in corpusfile: # probably don't need enumerate anymore
        line = line.strip()

        ## original buffer handling, fill buffer regardless what it is
        # if len(before_buf)>10:
        #     before_buf.pop(0) # keep the buffer at size 10
        # before_buf.append(line) # append to before buffer
        # while len(after_buf)<=10:
        #     try:
        #         after_buf.append(next(corpusfile_clone)) # advance 1 iterator
        #     except StopIteration:
        #         pass # copy iterator will exhaust 1st coz its 10 iteration ahead
        # after_buf and after_buf.pop(0) # pop off one ready for next, only pop if list not empty

        if re.match('[A-Z][a-z]', line):
            parts = line.split()
            lemma = parts[2]

            # edited buffer handling, fill buffer excluded line contains any of '&', '\', '(unknown)' 
            if not any(w in lemma for w in excluded_words):
                before_buf.append(line) # append to before buffer
            if len(before_buf)>11:
                before_buf.pop(0) # keep 11 entries: the current line + the 10 before it
            while len(after_buf)<=10:
                try:
                    after = next(corpusfile_clone) # advance 1 iterator
                    after_lemma = ''
                    if re.match('[A-Z][a-z]', after):
                        after_lemma = after.split()[2]
                    # else: # logic with else + continue should work the same here
                    #     continue
                except StopIteration:
                    break # the clone exhausts first since it runs 10 iterations ahead
                if after_lemma and not any(w in after_lemma for w in excluded_words):
                    after_buf.append(after) # append to after buffer
            after_buf and after_buf.pop(0) # pop off one ready for next, only pop if list not empty

            if lemma in targets:
                pos = parts[1]
                if pos not in targets[lemma]:
                    targets[lemma][pos] = {}
                counts = targets[lemma][pos]
                # context = []
                # look at 10 previous lines
                context = before_buf[:-1] # minus out the current line

                # look at the next 10 lines
                context.extend(after_buf)

                # END OF CHANGED CODE
                # CONTINUE YOUR STUFF HERE WITH CONTEXT
                # .....

With the edited code, since unwanted lines are already filtered out before they enter the buffers, the redundant checks can be dropped when processing the context:

                for context_line in context:
                    context_line = context_line.strip()
                    if re.match('[A-Z][a-z]', context_line):
                        parts_context = context_line.split() # plain split(), per the edit above
                        context_lemma = parts_context[2]

                        # remove 6 lines below
                        # if '(unknown)' in context_lemma:
                        #     continue
                        # elif '\\' in context_lemma:
                        #     continue
                        # elif '&' in context_lemma:
                        #     continue
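
One more possible refinement (untested sketch): collections.deque with maxlen gives the same sliding-window behaviour for the before-buffer without the O(n) cost of list.pop(0), and popleft() does the same for the after-buffer:

from collections import deque

before_buf = deque(maxlen=11) # current line + 10 before; a full deque drops the oldest on append
after_buf = deque()           # plain deque: popleft() is O(1), unlike list.pop(0)

# inside the loop, the buffer handling then becomes:
#     before_buf.append(line)              # no explicit pop needed any more
#     ... fill after_buf exactly as before ...
#     after_buf and after_buf.popleft()    # replaces after_buf.pop(0)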