Python – Fastest Method for Search and Replace on a Large String

pythonregex

I'm looking for the fastest way to replace a large number of sub-strings inside a very large string. Here are two examples I've used.

findall() feels simpler and more elegant, but it takes an astounding amount of time.

finditer() blazes through a large file, but I'm not sure this is the right way to do it.

Here's some sample code. Note that the actual text I'm interested in is a single string around 10MB in size, and there's a huge difference in these two methods.

import re

def findall_replace(text, reg, rep):
    for match in reg.findall(text):
        output = text.replace(match, rep)
    return output

def finditer_replace(text, reg, rep):
    cursor_pos = 0
    output = ''
    for match in reg.finditer(text):
        output += "".join([text[cursor_pos:match.start(1)], rep])
        cursor_pos = match.end(1)
    output += "".join([text[cursor_pos:]])
    return output

reg = re.compile(r'(dog)')
rep = 'cat'
text = 'dog cat dog cat dog cat'

finditer_replace(text, reg, rep)

findall_replace(text, reg, rep)

UPDATE Added re.sub method to tests:

def sub_replace(reg, rep, text):
    output = re.sub(reg, rep, text)
    return output

Results

re.sub() – 0:00:00.031000
finditer() – 0:00:00.109000
findall() – 0:01:17.260000

Best Answer

The standard method is to use the built-in

re.sub(reg, rep, text)

Incidentally the reason for the performance difference between your versions is that each replacement in your first version causes the entire string to be recopied. Copies are fast, but when you're copying 10 MB at a go, enough copies will become slow.

Related Solutions

Python – How to Perform Mass String Replace Efficiently

mydict = {"&y":"\033[0;30m",
          "&c":"\033[0;31m",
          "&b":"\033[0;32m",
          "&Y":"\033[0;33m",
          "&u":"\033[0;34m"}
mystr = "The &yquick &cbrown &bfox &Yjumps over the &ulazy dog"

for k, v in mydict.iteritems():
    mystr = mystr.replace(k, v)

print mystr
The ←[0;30mquick ←[0;31mbrown ←[0;32mfox ←[0;33mjumps over the ←[0;34mlazy dog

I took the liberty of comparing a few solutions:

mydict = dict([('&' + chr(i), str(i)) for i in list(range(65, 91)) + list(range(97, 123))])

# random inserts between keys
from random import randint
rawstr = ''.join(mydict.keys())
mystr = ''
for i in range(0, len(rawstr), 2):
    mystr += chr(randint(65,91)) * randint(0,20) # insert between 0 and 20 chars

from time import time

# How many times to run each solution
rep = 10000

print 'Running %d times with string length %d and ' \
      'random inserts of lengths 0-20' % (rep, len(mystr))

# My solution
t = time()
for x in range(rep):
    for k, v in mydict.items():
        mystr.replace(k, v)
    #print(mystr)
print '%-30s' % 'Tor fixed & variable dict', time()-t

from re import sub, compile, escape

# Peter Hansen
t = time()
for x in range(rep):
    sub(r'(&[a-zA-Z])', r'%(\1)s', mystr) % mydict
print '%-30s' % 'Peter fixed & variable dict', time()-t

# Claudiu
def multiple_replace(dict, text): 
    # Create a regular expression  from the dictionary keys
    regex = compile("(%s)" % "|".join(map(escape, dict.keys())))

    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)

t = time()
for x in range(rep):
    multiple_replace(mydict, mystr)
print '%-30s' % 'Claudio variable dict', time()-t

# Claudiu - Precompiled
regex = compile("(%s)" % "|".join(map(escape, mydict.keys())))

t = time()
for x in range(rep):
    regex.sub(lambda mo: mydict[mo.string[mo.start():mo.end()]], mystr)
print '%-30s' % 'Claudio fixed dict', time()-t

# Andrew Y - variable dict
def mysubst(somestr, somedict):
  subs = somestr.split("&")
  return subs[0] + "".join(map(lambda arg: somedict["&" + arg[0:1]] + arg[1:], subs[1:]))

t = time()
for x in range(rep):
    mysubst(mystr, mydict)
print '%-30s' % 'Andrew Y variable dict', time()-t

# Andrew Y - fixed
def repl(s):
  return mydict["&"+s[0:1]] + s[1:]

t = time()
for x in range(rep):
    subs = mystr.split("&")
    res = subs[0] + "".join(map(repl, subs[1:]))
print '%-30s' % 'Andrew Y fixed dict', time()-t

Results in Python 2.6

Running 10000 times with string length 490 and random inserts of lengths 0-20
Tor fixed & variable dict      1.04699993134
Peter fixed & variable dict    0.218999862671
Claudio variable dict          2.48400020599
Claudio fixed dict             0.0940001010895
Andrew Y variable dict         0.0309998989105
Andrew Y fixed dict            0.0310001373291

Both claudiu's and andrew's solutions kept going into 0, so I had to increase it to 10 000 runs.

I ran it in Python 3 (because of unicode) with replacements of chars from 39 to 1024 (38 is ampersand, so I didn't wanna include it). String length up to 10.000 including about 980 replacements with variable random inserts of length 0-20. The unicode values from 39 to 1024 causes characters of both 1 and 2 bytes length, which could affect some solutions.

mydict = dict([('&' + chr(i), str(i)) for i in range(39,1024)])

# random inserts between keys
from random import randint
rawstr = ''.join(mydict.keys())
mystr = ''
for i in range(0, len(rawstr), 2):
    mystr += chr(randint(65,91)) * randint(0,20) # insert between 0 and 20 chars

from time import time

# How many times to run each solution
rep = 10000

print('Running %d times with string length %d and ' \
      'random inserts of lengths 0-20' % (rep, len(mystr)))

# Tor Valamo - too long
#t = time()
#for x in range(rep):
#    for k, v in mydict.items():
#        mystr.replace(k, v)
#print('%-30s' % 'Tor fixed & variable dict', time()-t)

from re import sub, compile, escape

# Peter Hansen
t = time()
for x in range(rep):
    sub(r'(&[a-zA-Z])', r'%(\1)s', mystr) % mydict
print('%-30s' % 'Peter fixed & variable dict', time()-t)

# Peter 2
def dictsub(m):
    return mydict[m.group()]

t = time()
for x in range(rep):
    sub(r'(&[a-zA-Z])', dictsub, mystr)
print('%-30s' % 'Peter fixed dict', time()-t)

# Claudiu - too long
#def multiple_replace(dict, text): 
#    # Create a regular expression  from the dictionary keys
#    regex = compile("(%s)" % "|".join(map(escape, dict.keys())))
#
#    # For each match, look-up corresponding value in dictionary
#    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)
#
#t = time()
#for x in range(rep):
#    multiple_replace(mydict, mystr)
#print('%-30s' % 'Claudio variable dict', time()-t)

# Claudiu - Precompiled
regex = compile("(%s)" % "|".join(map(escape, mydict.keys())))

t = time()
for x in range(rep):
    regex.sub(lambda mo: mydict[mo.string[mo.start():mo.end()]], mystr)
print('%-30s' % 'Claudio fixed dict', time()-t)

# Separate setup for Andrew and gnibbler optimized dict
mydict = dict((k[1], v) for k, v in mydict.items())

# Andrew Y - variable dict
def mysubst(somestr, somedict):
  subs = somestr.split("&")
  return subs[0] + "".join(map(lambda arg: somedict[arg[0:1]] + arg[1:], subs[1:]))

def mysubst2(somestr, somedict):
  subs = somestr.split("&")
  return subs[0].join(map(lambda arg: somedict[arg[0:1]] + arg[1:], subs[1:]))

t = time()
for x in range(rep):
    mysubst(mystr, mydict)
print('%-30s' % 'Andrew Y variable dict', time()-t)
t = time()
for x in range(rep):
    mysubst2(mystr, mydict)
print('%-30s' % 'Andrew Y variable dict 2', time()-t)

# Andrew Y - fixed
def repl(s):
  return mydict[s[0:1]] + s[1:]

t = time()
for x in range(rep):
    subs = mystr.split("&")
    res = subs[0] + "".join(map(repl, subs[1:]))
print('%-30s' % 'Andrew Y fixed dict', time()-t)

# gnibbler
t = time()
for x in range(rep):
    myparts = mystr.split("&")
    myparts[1:]=[mydict[x[0]]+x[1:] for x in myparts[1:]]
    "".join(myparts)
print('%-30s' % 'gnibbler fixed & variable dict', time()-t)

Results:

Running 10000 times with string length 9491 and random inserts of lengths 0-20
Tor fixed & variable dict      0.0 # disqualified 329 secs
Peter fixed & variable dict    2.07799983025
Peter fixed dict               1.53100013733 
Claudio variable dict          0.0 # disqualified, 37 secs
Claudio fixed dict             1.5
Andrew Y variable dict         0.578000068665
Andrew Y variable dict 2       0.56299996376
Andrew Y fixed dict            0.56200003624
gnibbler fixed & variable dict 0.530999898911

(** Note that gnibbler's code uses a different dict, where keys don't have the '&' included. Andrew's code also uses this alternate dict, but it didn't make much of a difference, maybe just 0.01x speedup.)

Speed Up Millions of Regex Replacements in Python 3

TLDR

Use this method if you want the fastest regex-based solution. For a dataset similar to the OP's, it's approximately 1000 times faster than the accepted answer.

If you don't care about regex, use this set-based version, which is 2000 times faster than a regex union.

Optimized Regex with Trie

A simple Regex union approach becomes slow with many banned words, because the regex engine doesn't do a very good job of optimizing the pattern.

It's possible to create a Trie with all the banned words and write the corresponding regex. The resulting trie or regex aren't really human-readable, but they do allow for very fast lookup and match.

Example

['foobar', 'foobah', 'fooxar', 'foozap', 'fooza']

The list is converted to a trie:

And then to this regex pattern:

r"\bfoo(?:ba[hr]|xar|zap?)\b"

The huge advantage is that to test if zoo matches, the regex engine only needs to compare the first character (it doesn't match), instead of trying the 5 words. It's a preprocess overkill for 5 words, but it shows promising results for many thousand words.

Note that (?:) non-capturing groups are used because:

foobar|baz would match foobar or baz, but not foobaz
foo(bar|baz) would save unneeded information to a capturing group.

Code

Here's a slightly modified gist, which we can use as a trie.py library:

import re


class Trie():
    """Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
    The corresponding Regex should match much faster than a simple Regex union."""

    def __init__(self):
        self.data = {}

    def add(self, word):
        ref = self.data
        for char in word:
            ref[char] = char in ref and ref[char] or {}
            ref = ref[char]
        ref[''] = 1

    def dump(self):
        return self.data

    def quote(self, char):
        return re.escape(char)

    def _pattern(self, pData):
        data = pData
        if "" in data and len(data.keys()) == 1:
            return None

        alt = []
        cc = []
        q = 0
        for char in sorted(data.keys()):
            if isinstance(data[char], dict):
                try:
                    recurse = self._pattern(data[char])
                    alt.append(self.quote(char) + recurse)
                except:
                    cc.append(self.quote(char))
            else:
                q = 1
        cconly = not len(alt) > 0

        if len(cc) > 0:
            if len(cc) == 1:
                alt.append(cc[0])
            else:
                alt.append('[' + ''.join(cc) + ']')

        if len(alt) == 1:
            result = alt[0]
        else:
            result = "(?:" + "|".join(alt) + ")"

        if q:
            if cconly:
                result += "?"
            else:
                result = "(?:%s)?" % result
        return result

    def pattern(self):
        return self._pattern(self.dump())

Test

Here's a small test (the same as this one):

# Encoding: utf-8
import re
import timeit
import random
from trie import Trie

with open('/usr/share/dict/american-english') as wordbook:
    banned_words = [word.strip().lower() for word in wordbook]
    random.shuffle(banned_words)

test_words = [
    ("Surely not a word", "#surely_NöTäWORD_so_regex_engine_can_return_fast"),
    ("First word", banned_words[0]),
    ("Last word", banned_words[-1]),
    ("Almost a word", "couldbeaword")
]

def trie_regex_from_words(words):
    trie = Trie()
    for word in words:
        trie.add(word)
    return re.compile(r"\b" + trie.pattern() + r"\b", re.IGNORECASE)

def find(word):
    def fun():
        return union.match(word)
    return fun

for exp in range(1, 6):
    print("\nTrieRegex of %d words" % 10**exp)
    union = trie_regex_from_words(banned_words[:10**exp])
    for description, test_word in test_words:
        time = timeit.timeit(find(test_word), number=1000) * 1000
        print("  %s : %.1fms" % (description, time))

It outputs:

TrieRegex of 10 words
  Surely not a word : 0.3ms
  First word : 0.4ms
  Last word : 0.5ms
  Almost a word : 0.5ms

TrieRegex of 100 words
  Surely not a word : 0.3ms
  First word : 0.5ms
  Last word : 0.9ms
  Almost a word : 0.6ms

TrieRegex of 1000 words
  Surely not a word : 0.3ms
  First word : 0.7ms
  Last word : 0.9ms
  Almost a word : 1.1ms

TrieRegex of 10000 words
  Surely not a word : 0.1ms
  First word : 1.0ms
  Last word : 1.2ms
  Almost a word : 1.2ms

TrieRegex of 100000 words
  Surely not a word : 0.3ms
  First word : 1.2ms
  Last word : 0.9ms
  Almost a word : 1.6ms

For info, the regex begins like this:

(?:a(?:(?:\'s|a(?:\'s|chen|liyah(?:\'s)?|r(?:dvark(?:(?:\'s|s))?|on))|b(?:\'s|a(?:c(?:us(?:(?:\'s|es))?|[ik])|ft|lone(?:(?:\'s|s))?|ndon(?:(?:ed|ing|ment(?:\'s)?|s))?|s(?:e(?:(?:ment(?:\'s)?|[ds]))?|h(?:(?:e[ds]|ing))?|ing)|t(?:e(?:(?:ment(?:\'s)?|[ds]))?|ing|toir(?:(?:\'s|s))?))|b(?:as(?:id)?|e(?:ss(?:(?:\'s|es))?|y(?:(?:\'s|s))?)|ot(?:(?:\'s|t(?:\'s)?|s))?|reviat(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?))|y(?:\'s)?|\é(?:(?:\'s|s))?)|d(?:icat(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?))|om(?:en(?:(?:\'s|s))?|inal)|u(?:ct(?:(?:ed|i(?:ng|on(?:(?:\'s|s))?)|or(?:(?:\'s|s))?|s))?|l(?:\'s)?))|e(?:(?:\'s|am|l(?:(?:\'s|ard|son(?:\'s)?))?|r(?:deen(?:\'s)?|nathy(?:\'s)?|ra(?:nt|tion(?:(?:\'s|s))?))|t(?:(?:t(?:e(?:r(?:(?:\'s|s))?|d)|ing|or(?:(?:\'s|s))?)|s))?|yance(?:\'s)?|d))?|hor(?:(?:r(?:e(?:n(?:ce(?:\'s)?|t)|d)|ing)|s))?|i(?:d(?:e[ds]?|ing|jan(?:\'s)?)|gail|l(?:ene|it(?:ies|y(?:\'s)?)))|j(?:ect(?:ly)?|ur(?:ation(?:(?:\'s|s))?|e[ds]?|ing))|l(?:a(?:tive(?:(?:\'s|s))?|ze)|e(?:(?:st|r))?|oom|ution(?:(?:\'s|s))?|y)|m\'s|n(?:e(?:gat(?:e[ds]?|i(?:ng|on(?:\'s)?))|r(?:\'s)?)|ormal(?:(?:it(?:ies|y(?:\'s)?)|ly))?)|o(?:ard|de(?:(?:\'s|s))?|li(?:sh(?:(?:e[ds]|ing))?|tion(?:(?:\'s|ist(?:(?:\'s|s))?))?)|mina(?:bl[ey]|t(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?)))|r(?:igin(?:al(?:(?:\'s|s))?|e(?:(?:\'s|s))?)|t(?:(?:ed|i(?:ng|on(?:(?:\'s|ist(?:(?:\'s|s))?|s))?|ve)|s))?)|u(?:nd(?:(?:ed|ing|s))?|t)|ve(?:(?:\'s|board))?)|r(?:a(?:cadabra(?:\'s)?|d(?:e[ds]?|ing)|ham(?:\'s)?|m(?:(?:\'s|s))?|si(?:on(?:(?:\'s|s))?|ve(?:(?:\'s|ly|ness(?:\'s)?|s))?))|east|idg(?:e(?:(?:ment(?:(?:\'s|s))?|[ds]))?|ing|ment(?:(?:\'s|s))?)|o(?:ad|gat(?:e[ds]?|i(?:ng|on(?:(?:\'s|s))?)))|upt(?:(?:e(?:st|r)|ly|ness(?:\'s)?))?)|s(?:alom|c(?:ess(?:(?:\'s|e[ds]|ing))?|issa(?:(?:\'s|[es]))?|ond(?:(?:ed|ing|s))?)|en(?:ce(?:(?:\'s|s))?|t(?:(?:e(?:e(?:(?:\'s|ism(?:\'s)?|s))?|d)|ing|ly|s))?)|inth(?:(?:\'s|e(?:\'s)?))?|o(?:l(?:ut(?:e(?:(?:\'s|ly|st?))?|i(?:on(?:\'s)?|sm(?:\'s)?))|v(?:e[ds]?|ing))|r(?:b(?:(?:e(?:n(?:cy(?:\'s)?|t(?:(?:\'s|s))?)|d)|ing|s))?|pti...

It's really unreadable, but for a list of 100000 banned words, this Trie regex is 1000 times faster than a simple regex union!

Here's a diagram of the complete trie, exported with trie-python-graphviz and graphviz twopi: