diff --git a/Lib/difflib.py b/Lib/difflib.py index 7c7e233b013a76..3a2f2ba0785ee3 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -32,16 +32,231 @@ from _colorize import can_colorize, get_theme from heapq import nlargest as _nlargest -from collections import namedtuple as _namedtuple +from collections import Counter as _Counter, namedtuple as _namedtuple from types import GenericAlias Match = _namedtuple('Match', 'a b size') + +def _adjust_indices(seq, start, stop): + if start < 0: + raise ValueError('Starting index can not be negative') + size = len(seq) + if stop is None or stop > size: + stop = size + return start, stop + + +class _LCSUBSimple: + """Simple dict method for finding longest common substring. + + Complexity: + T: O(n1 + n2) best, O(n1 × n2) worst + S: O(n2) + , where n1 = len(a), n2 = len(b) + + Members: + _b2j for x in b, b2j[x] is a list of the indices (into b) + at which x appears; junk elements do not appear + """ + + def __init__(self, b, junk=()): + if not isinstance(junk, frozenset): + junk = frozenset(junk) + self.b = b + self.junk = junk + self._b2j = None + + def isbuilt(self, blo, bhi): + blo, bhi = _adjust_indices(self.b, blo, bhi) + if blo >= bhi: + return True + return self._b2j is not None + + def _get_b2j(self): + b2j = self._b2j + if b2j is not None: + return b2j + + b2j = {} # positions of each element in b + for i, elt in enumerate(self.b): + indices = b2j.setdefault(elt, []) + indices.append(i) + junk = self.junk + if junk: + for elt in junk: + del b2j[elt] + self._b2j = b2j + return b2j + + def find(self, a, alo=0, ahi=None, blo=0, bhi=None): + alo, ahi = _adjust_indices(a, alo, ahi) + blo, bhi = _adjust_indices(self.b, blo, bhi) + if alo >= ahi or blo >= bhi: + return (alo, blo, 0) + + b2j = self._get_b2j() + j2len = {} + nothing = [] + besti, bestj, bestsize = alo, blo, 0 + # find longest junk-free match + # during an iteration of the loop, j2len[j] = length of longest + # junk-free match ending with a[i-1] and b[j] + for 
i in range(alo, ahi): + # look at all instances of a[i] in b; note that because + # b2j has no junk keys, the loop is skipped if a[i] is junk + j2lenget = j2len.get + newj2len = {} + for j in b2j.get(a[i], nothing): + # a[i] matches b[j] + if j < blo: + continue + if j >= bhi: + break + k = newj2len[j] = j2lenget(j-1, 0) + 1 + if k > bestsize: + besti = i - k + 1 + bestj = j - k + 1 + bestsize = k + j2len = newj2len + + return besti, bestj, bestsize + + +class _LCSUBAutomaton: + """Suffix Automaton for finding longest common substring. + + Complexity: + T: O(n1 + n2) - roughly 2 * n1 + 6 * n2 + S: O(n2) - maximum nodes: 2 * n2 + 1 + , where n1 = len(a), n2 = len(b) + + Node spec: + node: list = [length: int, link: list, next: dict, end_pos: int] + length - match length when the node is reached + link - reference to a node to fall back to + next - map to nodes to go to when matched + end_pos - end position of first occurrence (used for result) + """ + + def __init__(self, b, junk=()): + if not isinstance(junk, frozenset): + junk = frozenset(junk) + self.b = b + self.junk = junk + self._root = None + self._cache = (None, None) + + def isbuilt(self, blo, bhi): + blo, bhi = _adjust_indices(self.b, blo, bhi) + if blo >= bhi: + return True + return self._root is not None and self._cache == (blo, bhi) + + def _get_root(self, blo, bhi): + """ + Automaton needs to rebuild for every (start2, stop2) + It is made to cache the last one and only rebuilds for new range + """ + key = (blo, bhi) + root = self._root + if root is not None and self._cache == key: + return root + + LEN, LINK, NEXT, EPOS = 0, 1, 2, 3 + root = [0, None, {}, -1] + b = self.b + junk = self.junk + last_len = 0 + last = root + for j in range(blo, bhi): + c = b[j] + if c in junk: + last_len = 0 + last = root + else: + last_len += 1 + curr = [last_len, None, {}, j] + + p = last + p_next = p[NEXT] + while c not in p_next: + p_next[c] = curr + if p is root: + curr[LINK] = root + break + p = p[LINK] + p_next = 
p[NEXT] + else: + q = p_next[c] + p_len_p1 = p[LEN] + 1 + if p_len_p1 == q[LEN]: + curr[LINK] = q + else: + # Copy `q[EPOS]` to ensure leftmost match in b + clone = [p_len_p1, q[LINK], q[NEXT].copy(), q[EPOS]] + while (p_next := p[NEXT]).get(c) is q: + p_next[c] = clone + if p is root: + break + p = p[LINK] + + q[LINK] = curr[LINK] = clone + + last = curr + + self._root = root + self._cache = key + return root + + def find(self, a, alo=0, ahi=None, blo=0, bhi=None): + alo, ahi = _adjust_indices(a, alo, ahi) + blo, bhi = _adjust_indices(self.b, blo, bhi) + if alo >= ahi or blo >= bhi: + return (alo, blo, 0) + + LEN, LINK, NEXT, EPOS = 0, 1, 2, 3 + root = self._get_root(blo, bhi) + junk = self.junk + v = root + l = 0 + best_len = 0 + best_state = None + best_pos = 0 + + for i in range(alo, ahi): + c = a[i] + if c in junk: + v = root + l = 0 + else: + while v is not root and c not in v[NEXT]: + v = v[LINK] + l = v[LEN] + + v_next = v[NEXT] + if c in v_next: + v = v_next[c] + l += 1 + if l > best_len: + best_len = l + best_state = v + best_pos = i + + if not best_len: + return (alo, blo, 0) + + start_in_s1 = best_pos + 1 - best_len + start_in_s2 = best_state[EPOS] + 1 - best_len + return (start_in_s1, start_in_s2, best_len) + + def _calculate_ratio(matches, length): if length: return 2.0 * matches / length return 1.0 + class SequenceMatcher: """ @@ -276,32 +491,41 @@ def __chain_b(self): # out the junk later is much cheaper than building b2j "right" # from the start. 
b = self.b - self.b2j = b2j = {} - - for i, elt in enumerate(b): - indices = b2j.setdefault(elt, []) - indices.append(i) - - # Purge junk elements - self.bjunk = junk = set() isjunk = self.isjunk + self.bjunk = junk = set() + autojunk = self.autojunk + self.bpopular = popular = set() + self._bcounts = bcounts = dict(_Counter(b)) if isjunk: - for elt in b2j.keys(): - if isjunk(elt): - junk.add(elt) - for elt in junk: # separate loop avoids separate list of keys - del b2j[elt] + junk.update(filter(isjunk, bcounts)) + for elt in junk: + del bcounts[elt] - # Purge popular elements that are not junk - self.bpopular = popular = set() n = len(b) - if self.autojunk and n >= 200: + if autojunk and n >= 200: ntest = n // 100 + 1 - for elt, idxs in b2j.items(): - if len(idxs) > ntest: + for elt, num in bcounts.items(): + if num > ntest: popular.add(elt) for elt in popular: # ditto; as fast for 1% deletion - del b2j[elt] + del bcounts[elt] + + if not bcounts: + self._bcount_thres = 0 + else: + sum_bcount = sum(bcounts.values()) + avg_bcount = sum(c * c for c in bcounts.values()) / sum_bcount + max_bcount = max(bcounts.values()) + self._bcount_thres = avg_bcount * 0.8 + max_bcount * 0.2 + + self._all_junk = all_junk = frozenset(junk | popular) + self._lcsub_simple = _LCSUBSimple(b, all_junk) + self._lcsub_automaton = _LCSUBAutomaton(b, all_junk) + + @property + def b2j(self): + # NOTE: For backwards compatibility + return self._lcsub_simple._get_b2j() def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. @@ -361,32 +585,52 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): # Windiff ends up at the same place as diff, but by pairing up # the unique 'b's and then matching the first two 'a's. 
- a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ + a, b, isbjunk = self.a, self.b, self.bjunk.__contains__ if ahi is None: ahi = len(a) if bhi is None: bhi = len(b) - besti, bestj, bestsize = alo, blo, 0 - # find longest junk-free match - # during an iteration of the loop, j2len[j] = length of longest - # junk-free match ending with a[i-1] and b[j] - j2len = {} - nothing = [] - for i in range(alo, ahi): - # look at all instances of a[i] in b; note that because - # b2j has no junk keys, the loop is skipped if a[i] is junk - j2lenget = j2len.get - newj2len = {} - for j in b2j.get(a[i], nothing): - # a[i] matches b[j] - if j < blo: - continue - if j >= bhi: - break - k = newj2len[j] = j2lenget(j-1, 0) + 1 - if k > bestsize: - besti, bestj, bestsize = i-k+1, j-k+1, k - j2len = newj2len + asize = ahi - alo + bsize = bhi - blo + + if asize <= 0 and bsize <= 0: + besti, bestj, bestsize = alo, blo, 0 + else: + # Can trim a from both ends while characters are not in b + # This is cheap and we have bcounts at all times + bcounts = self._bcounts + tmp_alo = alo + tmp_ahi = ahi + while tmp_alo < tmp_ahi and a[tmp_alo] not in bcounts: + tmp_alo += 1 + while tmp_alo < tmp_ahi and a[tmp_ahi - 1] not in bcounts: + tmp_ahi -= 1 + tmp_asize = tmp_ahi - tmp_alo + if tmp_asize <= 0: + besti, bestj, bestsize = alo, blo, 0 + else: + # Constant to construct automaton is roughly - 6. + # Constant to run automaton is roughly - 1. + # This has been tested on a range of data sets. + # It gave selection accuracy of ~95%. + # Weak spot is cases with little or no element overlap at all. + # However, such check would likely have more cost than benefit. 
+ simple_calc = self._lcsub_simple + automaton = self._lcsub_automaton + + simple_cost = self._bcount_thres * tmp_asize + if not simple_calc.isbuilt(blo, bhi): + simple_cost += bsize + + automaton_cost = tmp_asize + if not automaton.isbuilt(blo, bhi): + automaton_cost += bsize * 6 + + if simple_cost < automaton_cost: + calc = simple_calc + else: + calc = automaton + besti, bestj, bestsize = calc.find(a, tmp_alo, tmp_ahi, blo, bhi) # Extend the best by non-junk elements on each end. In particular, # "popular" non-junk elements aren't in b2j, which greatly speeds diff --git a/Lib/test/test_pyclbr.py b/Lib/test/test_pyclbr.py index 79ef178f3807f4..f709cdb9522055 100644 --- a/Lib/test/test_pyclbr.py +++ b/Lib/test/test_pyclbr.py @@ -172,7 +172,7 @@ def test_easy(self): with temporary_main_spec(): self.checkModule('doctest', ignore=("TestResults", "_SpoofOut", "DocTestCase", '_DocTestSuite')) - self.checkModule('difflib', ignore=("Match",)) + self.checkModule('difflib', ignore=("Match", "b2j")) def test_cases(self): # see test.pyclbr_input for the rationale behind the ignored symbols