1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
import sys #input into this structure words = {} #strings generated go here gen = [] #input word word = "" #final output final = [] #first arg = word to parse, second arg = wordlist name, optional, third arg = (generated word) minimum length, optional arglength = len(sys.argv) if arglength > 1: word = sys.argv[1] else: print "Please specify an input file" quit() if arglength > 2: list = sys.argv[2] else: print "Using default wordlist.txt" list = "wordlist.txt" if arglength > 3: cutoff = int(sys.argv[3]) else: print "Using default cutoff of 1 letter" cutoff = 1 #create output file based on the word we're generating outFile = "output-" + word + ".txt" out = open(outFile, 'w') #create input file f = open(list,'r') #file is one word per line, strip out the endline character, and throw in a dictionary for line in f: line = line.strip() if len(line) > cutoff: words[line] = 1 #recurse calculates all the substrings forward, is exponential #input: current string, base string #example: spray => recurse("s","pray"), recurse("","pray") def recurse(current, string): if string == "": return gen.append(current) gen.append(str(current + string[:1])) recurse(current, string[1:]) recurse(str(current + string[:1]), string[1:]) #check the generated strings for words #input: string list, word list def checkOverlap(gen, words): for item in gen: #these are just strings, so lets check if they're words if item in words: #ignoring duplicates as we go if item not in final: final.append(item) #start recursion on the input word recurse("", word) #given the generated strings, find the words checkOverlap(gen, words) #sort the output final.sort() #write the output for item in final: out.write(item) out.write('\n') print "Output written to " + outFile
Refactorings
No refactoring yet !
nosklo
January 20, 2009, January 20, 2009 14:28, permalink
Using optparse to parse options.
Using a set to store words.
Using generators and list comprehensions.
Usage example:
$ python williams.py doubt
do
dot
doubt
dub
out
$ python williams.py -h
Usage: williams.py [options] words...
Options:
-h, --help show this help message and exit
-w FILE, --wordlist=FILE
wordlist filename, default wordlist.txt
-m LENGTH, --minlenght=LENGTH
minimum length of generated word, default 1
-o FILE, --output=FILE
Output to FILE. By default output to stdout.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
import sys import optparse # parse command line options: parser = optparse.OptionParser('%prog [options] words...') parser.add_option("-w", "--wordlist", dest="words", default='wordlist.txt', help="wordlist filename, default wordlist.txt", metavar="FILE") parser.add_option("-m", "--minlenght", dest="cutoff", default=1, type='int', help="minimum length of generated word, default 1", metavar="LENGTH") parser.add_option("-o", "--output", dest="outputfile", help="Output to FILE. By default output to stdout.", metavar="FILE") options, words = parser.parse_args() # read wordlist: wordlist = set(word.strip() for word in open(options.words) if len(word.strip()) > options.cutoff) # open output file if options.outputfile: out = open(options.outputfile, 'w') else: out = sys.stdout def recurse(current, base): """ recurse calculates all the substrings forward, is exponential input: current string, base string example: spray => recurse("s","pray"), recurse("","pray") """ result = set() if base: result.add(current) result.add(current + base[:1]) result.update(recurse(current, base[1:])) result.update(recurse(current + base[:1], base[1:])) return result for word in words: out.writelines(sorted('%s\n' % item for item in recurse('', word) if item in wordlist))
cmurphycode.blogspot.com
February 20, 2009, February 20, 2009 03:21, permalink
For some reason this failed to notify me. Nice stuff! I hadn't seen optparse before, I like it a lot. Using a set is a good idea and generators are always grand. Thanks!
cmurphycode.blogspot.com
April 27, 2010, April 27, 2010 20:35, permalink
Oh My. A long journey took me to this post, which I realized I never updated. Here's what I had written a few days after the initial post to optimize for longer words...still awful code, but prefix pruning allows somewhat better than exponential behavior.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
import sys #input into this structure words = {} wordList = [] #strings generated go here gen = [] #input word word = "" #final output final = [] #output file outFile = "" def recurse(current, string, prefixList): ''' calculates all the substrings forward (cannot backtrack) Input: current string, base string Example: spray => recurse("s","pray"), recurse("","pray") (exponential)''' #base case: no more base string to process if string == "": return #if the current string is not a prefix for a word, might as well quit if len(current) < len(prefixList): if current not in prefixList[len(current)]: return #recursive case: we create two new threads of processing, one with the first character in the base string, #and one without, thus we will hit all possible strings that can be generated forward #note: this is different than just mixing all of the letters together however you please #So here we add the two cases to our generated strings list gen.append(current) gen.append(str(current + string[:1])) #then recurse on the two recurse(current, string[1:],prefixList) recurse(str(current + string[:1]), string[1:],prefixList) def checkOverlap(gen, words): '''check the generated strings for words input: string list, word list''' for item in gen: #these are just strings, so lets check if they're words if item in words: #ignoring duplicates as we go if item not in final: final.append(item) def generatePrefixes(words): prefixList = [{},{},{},{},{},{},{}] for word in words: for i in range(7): prefixList[i][word[:i]] = 1 return prefixList if __name__ == "__main__": #first arg = word to parse, second arg = wordlist name, optional, third arg = (generated word) minimum length, optional arglength = len(sys.argv) if arglength > 1: word = sys.argv[1] else: print "Please specify an input word" quit() if arglength > 2: list = sys.argv[2] else: print "Using default wordlist.txt" list = "wordlist.txt" if arglength > 3: cutoff = int(sys.argv[3]) else: print "Using default cutoff of 1 letter" cutoff = 1 #create output file based on the word we're generating outFile = "pruned-output-" + word + ".txt" out = open(outFile, 'w') #create input file f = open(list,'r') #file is one word per line, strip out the endline character, and throw in a dictionary for line in f: line = line.strip() if len(line) > cutoff: words[line] = 1 wordList.append(line) prefixList = generatePrefixes(wordList) #start recursion on the input word recurse("", word, prefixList) #given the generated strings, find the words checkOverlap(gen, words) #sort the output final.sort() #write the output for item in final: out.write(item) out.write('\n') print "Output written to " + outFile
For my blog: http://www.cmurphycode.blogspot.com
A little project via Kottke (http://www.kottke.org/08/11/williams-poems), that I thought would be easy and fun. No need to actually refactor unless you're truly bored- I'm just experimenting with using this as a place to share code.