2155c92be66863c4634778bf522efe14

This code counts number of occurrences of each word in a string.

How can I improve it or make it more pythonic?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import re

def addWord(token, frequencies):
    """Record one more occurrence of a word in a frequency table.

    token       -- iterable of string pieces (here a list of characters)
                   that are joined to form the word.
    frequencies -- dict mapping word -> count; it is updated in place.

    Returns None.
    """
    word = ''.join(token)
    # dict.get supplies 0 for a word not seen before, so one lookup
    # replaces the membership test followed by a separate read.
    frequencies[word] = frequencies.get(word, 0) + 1

def getWordFrequencies(text):
    """Count how many times each word occurs in text.

    A word is a maximal run of characters matched by the regex word
    class (letters, digits and underscore); every other character acts
    as a separator. Matching is case-sensitive.

    Returns a dict mapping each word to its number of occurrences; an
    empty or separator-only text gives an empty dict.
    """
    frequencies = {}

    # A single findall pass yields the same maximal runs of word
    # characters that testing the text one character at a time builds up.
    # The raw string keeps the backslash from being read as a string escape.
    for word in re.findall(r'\w+', text):
        frequencies[word] = frequencies.get(word, 0) + 1

    return frequencies



# Adjacent string literals are joined by the parser. Each line carries an
# explicit newline so the last word of one line is not glued to the first
# word of the next (a backslash at the end of a line inside a string
# literal removes the line break entirely, producing "againMamma").
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)

result = getWordFrequencies(mamaMia)

# items() and a single-argument print(...) with an explicit format string
# run unchanged on both Python 2 and Python 3.
for word, freq in result.items():
    print("%s \t%s" % (freq, word))

Refactorings

No refactoring yet !

4d72203c38dd5f3e3d2d446b5888e8a7

Elij

December 8, 2007, December 08, 2007 08:58, permalink

4 ratings. Login to rate!

The regex needs some work -- but this does the same as yours...

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import re

def getWordFrequencies(text):
    """Return a dict mapping each word in text to its number of occurrences.

    A word is a maximal run of regex word characters (letters, digits,
    underscore). Matching is case-sensitive. Text with no words gives
    an empty dict.
    """
    frequencies = {}

    # findall(r'\w+') returns only the words themselves. Splitting on
    # '\W+' yields an empty string whenever the text starts or ends with
    # a separator (or is empty), and that empty string would be counted
    # as a word.
    for word in re.findall(r'\w+', text):
        # dict.get works on Python 2 and 3; dict.has_key exists only on 2.
        frequencies[word] = frequencies.get(word, 0) + 1

    return frequencies


# Adjacent string literals are joined by the parser. Each line carries an
# explicit newline so the last word of one line is not glued to the first
# word of the next (a backslash at the end of a line inside a string
# literal removes the line break entirely, producing "againMamma").
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)

result = getWordFrequencies(mamaMia)

# items() and a single-argument print(...) with an explicit format string
# run unchanged on both Python 2 and Python 3.
for word, freq in result.items():
    print("%s \t%s" % (freq, word))
D2ff155cd04fa175620d2f3495b11b08

lbolognini

December 8, 2007, December 08, 2007 14:02, permalink

6 ratings. Login to rate!

adict.update() overwrites existing values, so there's no need to check whether w is already in the dict.

1
2
3
4
5
6
7
8
9
mamaMia = "Mamma mia, here I go again\
 Mamma mia, here I go again\
 My my, how can I resist you?\
 Mamma mia, does it show again?\
 My my, just how much I've missed you"

# Count within the list of whole words. str.count on the full text counts
# substrings, so "how" would also be found inside "show" and "I" inside
# "I've", inflating the totals.
words = mamaMia.split()

adict = {}
for w in words:
    # update() overwrites any earlier entry for w with the same count.
    adict.update({w: words.count(w)})
print(adict)
762e6a51b13c6357f178e65b19392e09

Netferret

February 25, 2008, February 25, 2008 15:40, permalink

1 rating. Login to rate!

This is the best solution I have found.

1
2
// s is the string in which to count occurrences of "YourWord".
// Splitting on the word yields one more piece than there are occurrences.
alert(s.split("YourWord").length - 1);
Avatar

Tim

June 29, 2008, June 29, 2008 00:32, permalink

3 ratings. Login to rate!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import re
from collections import defaultdict

text = """
Mamma mia, here I go again
Mamma mia, here I go again
My my, how can I resist you?
Mamma mia, does it show again?
My my, just how much I've missed you
""".strip()

# Case-insensitive tally: every word is lower-cased before counting.
histogram = defaultdict(int)
# findall(r"\w+") yields only the words; unlike splitting on "\W+" it can
# never produce an empty token when the text starts or ends with punctuation.
for word in re.findall(r"\w+", text):
    histogram[word.lower()] += 1

# Left-justify the word column to the width of the longest word.
template = "%-" + str(max(len(word) for word in histogram)) + "s  %s"
# Most frequent words first. A single-argument print(...) runs unchanged on
# Python 2 and Python 3.
print("\n".join(
    template % (word, freq)
    for word, freq in sorted(histogram.items(),
                             key=lambda item: item[1], reverse=True)))
80249086b1bd0d24c2307fe787b55e91

Tom

July 27, 2008, July 27, 2008 03:48, permalink

1 rating. Login to rate!

As per lbolognini, but no variable required.

1
2
3
4
5
6
7
8
9
mamaMia = "Mamma mia, here I go again\
 Mamma mia, here I go again\
 My my, how can I resist you?\
 Mamma mia, does it show again?\
 My my, just how much I've missed you"

# Count within the list of whole words. str.count on the full text counts
# substrings, so "how" would also be found inside "show".
words = mamaMia.split()
result = dict((w, words.count(w)) for w in words)

# Each item is a (word, count) tuple that fills the two format fields.
for i in result.items():
    print("%s\t%d" % i)
0ec929f9b2472896c4c8eb97d457a10d

Walter Cruz

July 28, 2008, July 28, 2008 12:46, permalink

2 ratings. Login to rate!

What about shlex and itertools?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import shlex
from itertools import groupby

def ilen(it):
    """Return the number of items produced by the iterable it.

    The iterable is consumed in the process. An empty iterable gives 0
    rather than failing on a loop counter that was never assigned.
    """
    return sum(1 for _ in it)

def getWordFrequencies(text):
    """Yield a (count, word) pair for each distinct token in text.

    The text is tokenized with a non-POSIX shlex lexer whose whitespace
    set is extended with '?', ',' and the single quote. Tokens are
    sorted, so pairs come out in sorted token order.
    """
    tokenizer = shlex.shlex(text, posix=False)
    tokenizer.whitespace += "?,'"
    ordered_tokens = sorted(list(tokenizer))

    # groupby merges adjacent equal tokens; the sort above makes all
    # equal tokens adjacent.
    for word, run in groupby(ordered_tokens):
        yield ilen(run), word

# Adjacent string literals are joined by the parser. Each line carries an
# explicit newline so the last word of one line is not glued to the first
# word of the next (a backslash at the end of a line inside a string
# literal removes the line break entirely, producing "againMamma").
mamaMia = (
    "Mamma mia, here I go again\n"
    "Mamma mia, here I go again\n"
    "My my, how can I resist you?\n"
    "Mamma mia, does it show again?\n"
    "My my, just how much I've missed you"
)

# A single-argument print(...) with an explicit format string runs
# unchanged on both Python 2 and Python 3.
for freq, word in getWordFrequencies(mamaMia):
    print("%s \t%s" % (freq, word))
264124475f095b65634c53da3380b88d

Leif Ryge

October 31, 2008, October 31, 2008 19:58, permalink

No rating. Login to rate!

This prints not only the count but also the locations of each word.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
def mk_index(seq):
    """Map each distinct item of seq to the list of positions where it occurs.

    Positions are 0-based and listed in increasing order.

    >>> sorted(mk_index("abcba").items())
    [('a', [0, 4]), ('b', [1, 3]), ('c', [2])]
    """
    positions = {}
    for where, element in enumerate(seq):
        if element in positions:
            positions[element].append(where)
        else:
            positions[element] = [where]
    return positions

def printWordFrequencyAndLocationReport(text):
    """Report count and locations of each word in text.

    Words are the whitespace-separated pieces of the lower-cased text and
    locations are 0-based word positions. Lines are printed most frequent
    first; words with equal counts are listed alphabetically so the output
    does not depend on dict ordering.

    >>> printWordFrequencyAndLocationReport('Ob la di ob la da')
    la occurred 2 times [1, 4]
    ob occurred 2 times [0, 3]
    da occurred 1 time [5]
    di occurred 1 time [2]
    """
    index = mk_index(text.lower().split())
    # The key takes one (word, locations) argument and indexes into it:
    # tuple-unpacking lambda parameters are a syntax error on Python 3.
    ranked = sorted(index.items(),
                    key=lambda entry: (-len(entry[1]), entry[0]))
    for word, locs in ranked:
        plural = 's' if len(locs) > 1 else ''
        print("%s occurred %s time%s %s" % (word, len(locs), plural, locs))

import doctest
# Run the examples embedded in this module's docstrings, reporting each one.
doctest.testmod(verbose=True)
264124475f095b65634c53da3380b88d

Leif Ryge

October 31, 2008, October 31, 2008 20:37, permalink

No rating. Login to rate!

Here is another version which uses the handy itertools.groupby function.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from itertools import groupby
import doctest

def printWordFrequencies(text):
    """Print one 'count word' line per distinct word, in sorted word order.

    Words are the whitespace-separated pieces of the lower-cased text.

    >>> printWordFrequencies("Ob la di ob la da")
    1 da
    1 di
    2 la
    2 ob
    """
    # groupby only merges adjacent equal items, hence the sort first.
    for word, run in groupby(sorted(text.lower().split())):
        # A single-argument print(...) runs unchanged on Python 2 and 3.
        print("%s %s" % (len(list(run)), word))

# Run the examples embedded in this module's docstrings, reporting each one.
doctest.testmod(verbose=True)
Eead9f6e9e61907ec68459cbed598454

Suganya

January 19, 2010, January 19, 2010 09:23, permalink

No rating. Login to rate!
1
I wanted code to count the number of occurrences of each word in a text using JavaScript. Could anyone please help?

Your refactoring





Format Copy from initial code

or Cancel