mydict = {"&y":"\033[0;30m",
"&c":"\033[0;31m",
"&b":"\033[0;32m",
"&Y":"\033[0;33m",
"&u":"\033[0;34m"}
mystr = "The &yquick &cbrown &bfox &Yjumps over the &ulazy dog"
for k, v in mydict.iteritems():
mystr = mystr.replace(k, v)
print mystr
The ←[0;30mquick ←[0;31mbrown ←[0;32mfox ←[0;33mjumps over the ←[0;34mlazy dog
I took the liberty of comparing a few solutions:
mydict = dict([('&' + chr(i), str(i)) for i in list(range(65, 91)) + list(range(97, 123))])
# random inserts between keys
from random import randint
rawstr = ''.join(mydict.keys())
mystr = ''
for i in range(0, len(rawstr), 2):
mystr += chr(randint(65,91)) * randint(0,20) # insert between 0 and 20 chars
from time import time
# How many times to run each solution
rep = 10000
print 'Running %d times with string length %d and ' \
'random inserts of lengths 0-20' % (rep, len(mystr))
# My solution
t = time()
for x in range(rep):
for k, v in mydict.items():
mystr.replace(k, v)
#print(mystr)
print '%-30s' % 'Tor fixed & variable dict', time()-t
from re import sub, compile, escape
# Peter Hansen
t = time()
for x in range(rep):
sub(r'(&[a-zA-Z])', r'%(\1)s', mystr) % mydict
print '%-30s' % 'Peter fixed & variable dict', time()-t
# Claudiu
def multiple_replace(dict, text):
# Create a regular expression from the dictionary keys
regex = compile("(%s)" % "|".join(map(escape, dict.keys())))
# For each match, look-up corresponding value in dictionary
return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)
t = time()
for x in range(rep):
multiple_replace(mydict, mystr)
print '%-30s' % 'Claudio variable dict', time()-t
# Claudiu - Precompiled
regex = compile("(%s)" % "|".join(map(escape, mydict.keys())))
t = time()
for x in range(rep):
regex.sub(lambda mo: mydict[mo.string[mo.start():mo.end()]], mystr)
print '%-30s' % 'Claudio fixed dict', time()-t
# Andrew Y - variable dict
def mysubst(somestr, somedict):
subs = somestr.split("&")
return subs[0] + "".join(map(lambda arg: somedict["&" + arg[0:1]] + arg[1:], subs[1:]))
t = time()
for x in range(rep):
mysubst(mystr, mydict)
print '%-30s' % 'Andrew Y variable dict', time()-t
# Andrew Y - fixed
def repl(s):
return mydict["&"+s[0:1]] + s[1:]
t = time()
for x in range(rep):
subs = mystr.split("&")
res = subs[0] + "".join(map(repl, subs[1:]))
print '%-30s' % 'Andrew Y fixed dict', time()-t
Results in Python 2.6
Running 10000 times with string length 490 and random inserts of lengths 0-20
Tor fixed & variable dict 1.04699993134
Peter fixed & variable dict 0.218999862671
Claudio variable dict 2.48400020599
Claudio fixed dict 0.0940001010895
Andrew Y variable dict 0.0309998989105
Andrew Y fixed dict 0.0310001373291
Both claudiu's and andrew's solutions kept going into 0, so I had to increase it to 10 000 runs.
I ran it in Python 3 (because of unicode) with replacements of chars from 39 to 1024 (38 is ampersand, so I didn't wanna include it). String length up to 10.000 including about 980 replacements with variable random inserts of length 0-20. The unicode values from 39 to 1024 causes characters of both 1 and 2 bytes length, which could affect some solutions.
mydict = dict([('&' + chr(i), str(i)) for i in range(39,1024)])
# random inserts between keys
from random import randint
rawstr = ''.join(mydict.keys())
mystr = ''
for i in range(0, len(rawstr), 2):
mystr += chr(randint(65,91)) * randint(0,20) # insert between 0 and 20 chars
from time import time
# How many times to run each solution
rep = 10000
print('Running %d times with string length %d and ' \
'random inserts of lengths 0-20' % (rep, len(mystr)))
# Tor Valamo - too long
#t = time()
#for x in range(rep):
# for k, v in mydict.items():
# mystr.replace(k, v)
#print('%-30s' % 'Tor fixed & variable dict', time()-t)
from re import sub, compile, escape
# Peter Hansen
t = time()
for x in range(rep):
sub(r'(&[a-zA-Z])', r'%(\1)s', mystr) % mydict
print('%-30s' % 'Peter fixed & variable dict', time()-t)
# Peter 2
def dictsub(m):
return mydict[m.group()]
t = time()
for x in range(rep):
sub(r'(&[a-zA-Z])', dictsub, mystr)
print('%-30s' % 'Peter fixed dict', time()-t)
# Claudiu - too long
#def multiple_replace(dict, text):
# # Create a regular expression from the dictionary keys
# regex = compile("(%s)" % "|".join(map(escape, dict.keys())))
#
# # For each match, look-up corresponding value in dictionary
# return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)
#
#t = time()
#for x in range(rep):
# multiple_replace(mydict, mystr)
#print('%-30s' % 'Claudio variable dict', time()-t)
# Claudiu - Precompiled
regex = compile("(%s)" % "|".join(map(escape, mydict.keys())))
t = time()
for x in range(rep):
regex.sub(lambda mo: mydict[mo.string[mo.start():mo.end()]], mystr)
print('%-30s' % 'Claudio fixed dict', time()-t)
# Separate setup for Andrew and gnibbler optimized dict
mydict = dict((k[1], v) for k, v in mydict.items())
# Andrew Y - variable dict
def mysubst(somestr, somedict):
subs = somestr.split("&")
return subs[0] + "".join(map(lambda arg: somedict[arg[0:1]] + arg[1:], subs[1:]))
def mysubst2(somestr, somedict):
subs = somestr.split("&")
return subs[0].join(map(lambda arg: somedict[arg[0:1]] + arg[1:], subs[1:]))
t = time()
for x in range(rep):
mysubst(mystr, mydict)
print('%-30s' % 'Andrew Y variable dict', time()-t)
t = time()
for x in range(rep):
mysubst2(mystr, mydict)
print('%-30s' % 'Andrew Y variable dict 2', time()-t)
# Andrew Y - fixed
def repl(s):
return mydict[s[0:1]] + s[1:]
t = time()
for x in range(rep):
subs = mystr.split("&")
res = subs[0] + "".join(map(repl, subs[1:]))
print('%-30s' % 'Andrew Y fixed dict', time()-t)
# gnibbler
t = time()
for x in range(rep):
myparts = mystr.split("&")
myparts[1:]=[mydict[x[0]]+x[1:] for x in myparts[1:]]
"".join(myparts)
print('%-30s' % 'gnibbler fixed & variable dict', time()-t)
Results:
Running 10000 times with string length 9491 and random inserts of lengths 0-20
Tor fixed & variable dict 0.0 # disqualified 329 secs
Peter fixed & variable dict 2.07799983025
Peter fixed dict 1.53100013733
Claudio variable dict 0.0 # disqualified, 37 secs
Claudio fixed dict 1.5
Andrew Y variable dict 0.578000068665
Andrew Y variable dict 2 0.56299996376
Andrew Y fixed dict 0.56200003624
gnibbler fixed & variable dict 0.530999898911
(** Note that gnibbler's code uses a different dict, where keys don't have the '&' included. Andrew's code also uses this alternate dict, but it didn't make much of a difference, maybe just 0.01x speedup.)
TLDR
Use this method if you want the fastest regex-based solution. For a dataset similar to the OP's, it's approximately 1000 times faster than the accepted answer.
If you don't care about regex, use this set-based version, which is 2000 times faster than a regex union.
Optimized Regex with Trie
A simple Regex union approach becomes slow with many banned words, because the regex engine doesn't do a very good job of optimizing the pattern.
It's possible to create a Trie with all the banned words and write the corresponding regex. The resulting trie or regex aren't really human-readable, but they do allow for very fast lookup and match.
Example
['foobar', 'foobah', 'fooxar', 'foozap', 'fooza']
The list is converted to a trie:
{
'f': {
'o': {
'o': {
'x': {
'a': {
'r': {
'': 1
}
}
},
'b': {
'a': {
'r': {
'': 1
},
'h': {
'': 1
}
}
},
'z': {
'a': {
'': 1,
'p': {
'': 1
}
}
}
}
}
}
}
And then to this regex pattern:
r"\bfoo(?:ba[hr]|xar|zap?)\b"
The huge advantage is that to test if zoo
matches, the regex engine only needs to compare the first character (it doesn't match), instead of trying the 5 words. It's a preprocess overkill for 5 words, but it shows promising results for many thousand words.
Note that (?:)
non-capturing groups are used because:
Code
Here's a slightly modified gist, which we can use as a trie.py
library:
import re
class Trie():
"""Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
The corresponding Regex should match much faster than a simple Regex union."""
def __init__(self):
self.data = {}
def add(self, word):
ref = self.data
for char in word:
ref[char] = char in ref and ref[char] or {}
ref = ref[char]
ref[''] = 1
def dump(self):
return self.data
def quote(self, char):
return re.escape(char)
def _pattern(self, pData):
data = pData
if "" in data and len(data.keys()) == 1:
return None
alt = []
cc = []
q = 0
for char in sorted(data.keys()):
if isinstance(data[char], dict):
try:
recurse = self._pattern(data[char])
alt.append(self.quote(char) + recurse)
except:
cc.append(self.quote(char))
else:
q = 1
cconly = not len(alt) > 0
if len(cc) > 0:
if len(cc) == 1:
alt.append(cc[0])
else:
alt.append('[' + ''.join(cc) + ']')
if len(alt) == 1:
result = alt[0]
else:
result = "(?:" + "|".join(alt) + ")"
if q:
if cconly:
result += "?"
else:
result = "(?:%s)?" % result
return result
def pattern(self):
return self._pattern(self.dump())
Test
Here's a small test (the same as this one):
# Encoding: utf-8
import re
import timeit
import random
from trie import Trie
with open('/usr/share/dict/american-english') as wordbook:
banned_words = [word.strip().lower() for word in wordbook]
random.shuffle(banned_words)
test_words = [
("Surely not a word", "#surely_NöTäWORD_so_regex_engine_can_return_fast"),
("First word", banned_words[0]),
("Last word", banned_words[-1]),
("Almost a word", "couldbeaword")
]
def trie_regex_from_words(words):
trie = Trie()
for word in words:
trie.add(word)
return re.compile(r"\b" + trie.pattern() + r"\b", re.IGNORECASE)
def find(word):
def fun():
return union.match(word)
return fun
for exp in range(1, 6):
print("\nTrieRegex of %d words" % 10**exp)
union = trie_regex_from_words(banned_words[:10**exp])
for description, test_word in test_words:
time = timeit.timeit(find(test_word), number=1000) * 1000
print(" %s : %.1fms" % (description, time))
It outputs:
TrieRegex of 10 words
Surely not a word : 0.3ms
First word : 0.4ms
Last word : 0.5ms
Almost a word : 0.5ms
TrieRegex of 100 words
Surely not a word : 0.3ms
First word : 0.5ms
Last word : 0.9ms
Almost a word : 0.6ms
TrieRegex of 1000 words
Surely not a word : 0.3ms
First word : 0.7ms
Last word : 0.9ms
Almost a word : 1.1ms
TrieRegex of 10000 words
Surely not a word : 0.1ms
First word : 1.0ms
Last word : 1.2ms
Almost a word : 1.2ms
TrieRegex of 100000 words
Surely not a word : 0.3ms
First word : 1.2ms
Last word : 0.9ms
Almost a word : 1.6ms
For info, the regex begins like this:
(?:a(?:(?:\'s|a(?:\'s|chen|liyah(?:\'s)?|r(?:dvark(?:(?:\'s|s))?|on))|b(?:\'s|a(?:c(?:us(?:(?:\'s|es))?|[ik])|ft|lone(?:(?:\'s|s))?|ndon(?:(?:ed|ing|ment(?:\'s)?|s))?|s(?:e(?:(?:ment(?:\'s)?|[ds]))?|h(?:(?:e[ds]|ing))?|ing)|t(?:e(?:(?:ment(?:\'s)?|[ds]))?|ing|toir(?:(?:\'s|s))?))|b(?:as(?:id)?|e(?:ss(?:(?:\'s|es))?|y(?:(?:\'s|s))?)|ot(?:(?:\'s|t(?:\'s)?|s))?|reviat(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?))|y(?:\'s)?|\é(?:(?:\'s|s))?)|d(?:icat(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?))|om(?:en(?:(?:\'s|s))?|inal)|u(?:ct(?:(?:ed|i(?:ng|on(?:(?:\'s|s))?)|or(?:(?:\'s|s))?|s))?|l(?:\'s)?))|e(?:(?:\'s|am|l(?:(?:\'s|ard|son(?:\'s)?))?|r(?:deen(?:\'s)?|nathy(?:\'s)?|ra(?:nt|tion(?:(?:\'s|s))?))|t(?:(?:t(?:e(?:r(?:(?:\'s|s))?|d)|ing|or(?:(?:\'s|s))?)|s))?|yance(?:\'s)?|d))?|hor(?:(?:r(?:e(?:n(?:ce(?:\'s)?|t)|d)|ing)|s))?|i(?:d(?:e[ds]?|ing|jan(?:\'s)?)|gail|l(?:ene|it(?:ies|y(?:\'s)?)))|j(?:ect(?:ly)?|ur(?:ation(?:(?:\'s|s))?|e[ds]?|ing))|l(?:a(?:tive(?:(?:\'s|s))?|ze)|e(?:(?:st|r))?|oom|ution(?:(?:\'s|s))?|y)|m\'s|n(?:e(?:gat(?:e[ds]?|i(?:ng|on(?:\'s)?))|r(?:\'s)?)|ormal(?:(?:it(?:ies|y(?:\'s)?)|ly))?)|o(?:ard|de(?:(?:\'s|s))?|li(?:sh(?:(?:e[ds]|ing))?|tion(?:(?:\'s|ist(?:(?:\'s|s))?))?)|mina(?:bl[ey]|t(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?)))|r(?:igin(?:al(?:(?:\'s|s))?|e(?:(?:\'s|s))?)|t(?:(?:ed|i(?:ng|on(?:(?:\'s|ist(?:(?:\'s|s))?|s))?|ve)|s))?)|u(?:nd(?:(?:ed|ing|s))?|t)|ve(?:(?:\'s|board))?)|r(?:a(?:cadabra(?:\'s)?|d(?:e[ds]?|ing)|ham(?:\'s)?|m(?:(?:\'s|s))?|si(?:on(?:(?:\'s|s))?|ve(?:(?:\'s|ly|ness(?:\'s)?|s))?))|east|idg(?:e(?:(?:ment(?:(?:\'s|s))?|[ds]))?|ing|ment(?:(?:\'s|s))?)|o(?:ad|gat(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?)))|upt(?:(?:e(?:st|r)|ly|ness(?:\'s)?))?)|s(?:alom|c(?:ess(?:(?:\'s|e[ds]|ing))?|issa(?:(?:\'s|[es]))?|ond(?:(?:ed|ing|s))?)|en(?:ce(?:(?:\'s|s))?|t(?:(?:e(?:e(?:(?:\'s|ism(?:\'s)?|s))?|d)|ing|ly|s))?)|inth(?:(?:\'s|e(?:\'s)?))?|o(?:l(?:ut(?:e(?:(?:\'s|ly|st?))?|i(?:on(?:\'s)?|sm(?:\'s)?))|v(?:e[ds]?|ing))|r(?:b(?:(?:e(?:n(?:cy(?:\'s)?|t(?:(?:\'s|s))?)|d)|ing|s))?|pti...
It's really unreadable, but for a list of 100000 banned words, this Trie regex is 1000 times faster than a simple regex union!
Here's a diagram of the complete trie, exported with trie-python-graphviz and graphviz twopi
:
Best Answer
The standard method is to use the built-in
Incidentally the reason for the performance difference between your versions is that each replacement in your first version causes the entire string to be recopied. Copies are fast, but when you're copying 10 MB at a go, enough copies will become slow.