1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
import re


def addWord(token, frequencies):
    """Increment the count of the word spelled by *token*.

    token: iterable of string pieces (e.g. a list of characters, or a
        whole string) that are joined to form the word.
    frequencies: dict mapping word -> count; updated in place.
    """
    word = ''.join(token)
    frequencies[word] = frequencies.get(word, 0) + 1


def getWordFrequencies(text):
    """Return a dict mapping each word in *text* to its occurrence count.

    A word is a maximal run of regex word characters (letters, digits,
    underscore); everything else is a separator. Matching is
    case-sensitive, and empty text yields an empty dict.
    """
    frequencies = {}
    # findall returns each maximal run of word characters, which is what
    # a character-by-character scan would assemble by hand.
    for word in re.findall(r'\w+', text):
        addWord(word, frequencies)
    return frequencies


# Adjacent literals with explicit newlines: a backslash continuation
# inside a string literal drops the line break, which can fuse the last
# word of one line with the first word of the next.
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)

result = getWordFrequencies(mamaMia)

for word, freq in result.items():
    # Single-argument print() is valid in both Python 2 and Python 3.
    print("%s \t%s" % (freq, word))
Refactorings
No refactoring yet !
Elij
December 8, 2007, December 08, 2007 08:58, permalink
The regex needs some work -- but this does the same as yours...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
import re


def getWordFrequencies(text):
    """Return a dict mapping each word in *text* to its occurrence count.

    A word is a maximal run of regex word characters (letters, digits,
    underscore). Matching is case-sensitive; empty text yields {}.
    """
    frequencies = {}
    # findall (rather than splitting on \W+) never produces empty
    # strings, so leading/trailing punctuation or an empty text does not
    # add a bogus '' entry to the result.
    for word in re.findall(r'\w+', text):
        frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies


# Adjacent literals with explicit newlines: a backslash continuation
# inside a string literal drops the line break, which can fuse the last
# word of one line with the first word of the next.
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)

result = getWordFrequencies(mamaMia)

for word, freq in result.items():
    # Single-argument print() is valid in both Python 2 and Python 3.
    print("%s \t%s" % (freq, word))
lbolognini
December 8, 2007, December 08, 2007 14:02, permalink
adict.update() overwrites existing values, so there's no need to check whether w is already in the dict.
1 2 3 4 5 6 7 8 9
from collections import Counter

# Adjacent literals with explicit newlines: a backslash continuation
# inside a string literal drops the line break, which can fuse the last
# word of one line with the first word of the next.
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)

# Tally the whitespace-separated tokens themselves. Calling
# mamaMia.count(w) would count substring matches (e.g. "how" inside
# "show") and rescan the whole text once per token.
adict = {}
adict.update(Counter(mamaMia.split()))

# Single-argument print() is valid in both Python 2 and Python 3.
print(adict)
Netferret
February 25, 2008, February 25, 2008 15:40, permalink
This is the best solution I have found.
1 2
// s is the string to search; the alert shows how many times "YourWord" occurs in it.
// Splitting on the word yields one more piece than there are occurrences.
var pieces = s.split("YourWord");
alert(pieces.length - 1);
Tim
June 29, 2008, June 29, 2008 00:32, permalink
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
import re
from collections import Counter

text = """
Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you
""".strip()

# Case-insensitive word tally. findall returns only non-empty runs of
# word characters, whereas splitting on \W+ can emit empty strings when
# the text starts or ends with punctuation.
histogram = Counter(word.lower() for word in re.findall(r"\w+", text))

# Left-justify the word column to the width of the longest word.
template = "%-" + str(max(len(word) for word in histogram)) + "s %s"

# Most frequent words first. Single-argument print() is valid in both
# Python 2 and Python 3.
print("\n".join(template % (word, freq)
                for word, freq in histogram.most_common()))
This code counts the number of occurrences of each word in a string.
How can I improve it or make it more pythonic?