Jack Levent Jack Levent - 9 months ago 37
Python Question

How to get word frequency using collections.Counter, even when the count is zero?

I am trying to get the counts for frequencies of words occurring in multiple files in a directory and thanks to this answer here I was able to get results for when the word did occur. However, I can't figure out how to also get the results to display when there are 0 occurrences of the word too.

This is the kind of result I want, so I always get results for all specified words, with the specified word in the first row and the count below.

21, 23, 60
4, 0, 8

Here's my current code:

import csv
import copy
import os
import sys
import glob
import string
import fileinput
from collections import Counter

def word_frequency(fileobj, words):
"""Build a Counter of specified words in fileobj"""
# initialise the counter to 0 for each word
ct = Counter(dict((w, 0) for w in words))
file_words = (word for line in fileobj for word in line.split())
filtered_words = (word for word in file_words if word in words)
return Counter(filtered_words)

def count_words_in_dir(dirpath, words, action):
"""For each .txt file in a dir, count the specified words"""
for filepath in glob.iglob(os.path.join(dirpath, '*.txt_out')):
filepath = {}
with open(filepath) as f:
ct = word_frequency(f, words)
action(filepath, ct)

def final_summary(filepath, ct):
words = sorted(ct.keys())
counts = [str(ct[k]) for k in words]
with open('new.csv','a') as f:
', '.join(words),
', '.join(counts)))]

words = set(['21','23','60','75','79','86','107','121','147','193','194','197','198','199','200','201','229','241','263','267','309','328'])
count_words_in_dir('C:\\Users\jllevent\Documents\PE Submsissions\Post-CLI', words, action=final_summary)


You are never using the ct Counter you constructed in word_frequency but constructing a new Counter that only has the existing words, you need to use your constructed ct, e.g.:

for word in file_words:
    if word in words:
        ct[word] += 1
return ct

Or as pointed out by @ShadowRanger below:

ct.update(word for word in file_words if word in words)
return ct