1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
import re


def addWord(token, frequencies):
    """Increment the count of one word in *frequencies*.

    token: sequence of characters (a list of chars or a plain string) that
        is joined to form the word.
    frequencies: dict mapping word -> count; it is updated in place.
    """
    word = ''.join(token)
    frequencies[word] = frequencies.get(word, 0) + 1


def getWordFrequencies(text):
    """Return a dict mapping each word of *text* to its number of occurrences.

    A word is a maximal run of regex word characters; matching is
    case-sensitive, so "My" and "my" are counted separately.
    """
    frequencies = {}
    # One regex pass replaces the hand-written character-by-character scanner.
    for word in re.findall(r'\w+', text):
        addWord(word, frequencies)
    return frequencies


# A triple-quoted literal keeps the line breaks.  A backslash continuation
# inside a string literal drops the newline, which glues the last word of one
# line to the first word of the next ("againMamma").
mamaMia = """Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you"""

result = getWordFrequencies(mamaMia)

for word, freq in result.items():
    # Same bytes as the Python 2 statement `print freq, "\t", word`
    # (no space is emitted after the tab).
    print("%s \t%s" % (freq, word))
Refactorings
No refactoring yet !
Elij
December 8, 2007, December 08, 2007 08:58, permalink
The regex needs some work -- but this does the same as yours...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
import re


def getWordFrequencies(text):
    """Return a dict mapping each word of *text* to its number of occurrences.

    A word is a maximal run of regex word characters; matching is
    case-sensitive.
    """
    frequencies = {}
    # findall() never produces the empty "word" that re.split() returns when
    # the text starts or ends with punctuation or whitespace.
    for c in re.findall(r'\w+', text):
        frequencies[c] = frequencies.get(c, 0) + 1
    return frequencies


# A triple-quoted literal keeps the line breaks.  A backslash continuation
# inside a string literal drops the newline, which glues the last word of one
# line to the first word of the next ("againMamma").
mamaMia = """Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you"""

result = getWordFrequencies(mamaMia)

for word, freq in result.items():
    # Same bytes as the Python 2 statement `print freq, "\t", word`
    # (no space is emitted after the tab).
    print("%s \t%s" % (freq, word))
lbolognini
December 8, 2007, December 08, 2007 14:02, permalink
adict.update() overwrites any existing value, so there is no need to check whether the word w is already in the dict.
1 2 3 4 5 6 7 8 9
# A triple-quoted literal keeps the line breaks.  A backslash continuation
# inside a string literal drops the newline, which glues the last word of one
# line to the first word of the next ("againMamma").
mamaMia = """Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you"""

# Whitespace-separated tokens; punctuation stays attached ("mia,", "you?").
words = mamaMia.split()

adict = {}
for w in words:
    # Count whole tokens in the word list.  str.count() on the full text
    # counts substrings, so "how" would also match inside "show".
    adict.update({w: words.count(w)})

print(adict)
Netferret
February 25, 2008, February 25, 2008 15:40, permalink
This is the best solution I have found.
1 2
// s is the string to check occurances of. alert(s.split("YourWord").length - 1);
Tim
June 29, 2008, June 29, 2008 00:32, permalink
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
import re
from collections import defaultdict

text = """
Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you
""".strip()

# Case-insensitive word counts.
histogram = defaultdict(int)
# findall() never yields the empty token that re.split() produces when the
# text starts or ends with a non-word character.
for word in re.findall(r"\w+", text):
    histogram[word.lower()] += 1

# Left-justify every word to the width of the longest one.
template = "%-" + str(max(len(word) for word in histogram)) + "s %s"

# Most frequent first; ties are broken alphabetically so the output is stable.
ranked = sorted(histogram.items(), key=lambda x: (-x[1], x[0]))
print("\n".join(template % (word, freq) for word, freq in ranked))
Tom
July 27, 2008, July 27, 2008 03:48, permalink
As per lbolognini, but no variable required.
1 2 3 4 5 6 7 8 9
# A triple-quoted literal keeps the line breaks.  A backslash continuation
# inside a string literal drops the newline, which glues the last word of one
# line to the first word of the next ("againMamma").
mamaMia = """Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you"""

# Whitespace-separated tokens; punctuation stays attached ("mia,", "you?").
words = mamaMia.split()

# Count whole tokens in the word list.  str.count() on the full text counts
# substrings, so "how" would also match inside "show".
result = dict((w, words.count(w)) for w in words)

for i in result.items():
    print("%s\t%d" % i)
Walter Cruz
July 28, 2008, July 28, 2008 12:46, permalink
What about shlex and itertools?
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
import shlex
from itertools import groupby


def ilen(it):
    """Return the number of items in the iterable *it*, consuming it.

    Returns 0 for an empty iterable (the enumerate-based version raised
    UnboundLocalError in that case).
    """
    return sum(1 for _ in it)


def getWordFrequencies(text):
    """Yield ``(count, word)`` pairs for the tokens of *text*, sorted by word.

    Tokens come from a non-POSIX shlex lexer with "?", "," and "'" added to
    its whitespace set, so those characters act as separators.
    """
    lexer = shlex.shlex(text, posix=False)
    lexer.whitespace = lexer.whitespace + "?,'"
    # groupby() only merges adjacent equal items, hence the sort first.
    for word, group in groupby(sorted(lexer)):
        yield ilen(group), word


# A triple-quoted literal keeps the line breaks.  A backslash continuation
# inside a string literal drops the newline, which glues the last word of one
# line to the first word of the next ("againMamma").
mamaMia = """Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you"""

for freq, word in getWordFrequencies(mamaMia):
    # Same bytes as the Python 2 statement `print freq, "\t", word`
    # (no space is emitted after the tab).
    print("%s \t%s" % (freq, word))
Leif Ryge
October 31, 2008, October 31, 2008 19:58, permalink
This prints not only the count but also the locations of each word.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
def mk_index(seq):
    """Index a sequence.

    Return a dict mapping each distinct item of *seq* to the list of
    positions at which it occurs, in increasing order.

    >>> sorted(mk_index("abcba").items())
    [('a', [0, 4]), ('b', [1, 3]), ('c', [2])]
    """
    result = {}
    for location, item in enumerate(seq):
        result.setdefault(item, []).append(location)
    return result


def printWordFrequencyAndLocationReport(text):
    """Report count and locations of each word in text.

    Words are the whitespace-separated tokens of the lower-cased text.
    The most frequent words are printed first; words with equal counts are
    printed in order of first occurrence, so the output does not depend on
    dict ordering.

    >>> printWordFrequencyAndLocationReport('Ob la di ob la da')
    ob occurred 2 times [0, 3]
    la occurred 2 times [1, 4]
    di occurred 1 time [2]
    da occurred 1 time [5]
    """
    index = mk_index(text.lower().split())
    # Tuple-parameter lambdas (`lambda (w, l): ...`) are Python 2 only, so the
    # (word, locations) pair is indexed explicitly instead.
    ranked = sorted(index.items(),
                    key=lambda item: (-len(item[1]), item[1][0]))
    for word, locs in ranked:
        print("%s occurred %s time%s %s"
              % (word, len(locs), ('', 's')[len(locs) > 1], locs))


import doctest

# Only run the doctests when executed as a script; on import testmod() would
# inspect the importing program's __main__ module instead of this one.
if __name__ == "__main__":
    doctest.testmod(verbose=True)
Leif Ryge
October 31, 2008, October 31, 2008 20:37, permalink
Here is another version which uses the handy itertools.groupby function.
1 2 3 4 5 6 7 8 9 10 11 12 13 14
from itertools import groupby
import doctest


def printWordFrequencies(text):
    """Print "<count> <word>" for each distinct word of *text*, sorted by word.

    Words are the whitespace-separated tokens of the lower-cased text.

    >>> printWordFrequencies("Ob la di ob la da")
    1 da
    1 di
    2 la
    2 ob
    """
    # groupby() only merges adjacent equal items, hence the sort first.
    for w, g in groupby(sorted(text.lower().split())):
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("%s %s" % (len(list(g)), w))


# Only run the doctests when executed as a script; on import testmod() would
# inspect the importing program's __main__ module instead of this one.
if __name__ == "__main__":
    doctest.testmod(verbose=True)
Suganya
January 19, 2010, January 19, 2010 09:23, permalink
1
I wanted code to count the number of occurrences of each word in a text using JavaScript. Can anyone please help?
This code counts number of occurrences of each word in a string.
How can I improve it or make it more pythonic?