From b507f04301019234c50a2bf4b5be5c54deffc72a Mon Sep 17 00:00:00 2001 From: Yew Ming Chen Date: Wed, 13 Nov 2019 11:23:11 +0800 Subject: [PATCH 1/2] Support very large ctags file - Use multiprocessing to improve performance. - Reduce memory footprint with VimList to process a:items on the fly instead of loading the whole thing into memory --- autoload/pymatcher.py | 122 ++++++++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 40 deletions(-) diff --git a/autoload/pymatcher.py b/autoload/pymatcher.py index a819dea..d9b9d9a 100644 --- a/autoload/pymatcher.py +++ b/autoload/pymatcher.py @@ -1,22 +1,82 @@ -import vim, re +import vim +import re import heapq +from multiprocessing import Pool +import os _escape = dict((c , "\\" + c) for c in ['^','$','.','{','}','(',')','[',']','\\','/','+']) +class FilenameScore: + def __init__(self, prog): + self.prog = prog + + def __call_(self, line): + # get filename via reverse find to improve performance + slashPos = line.rfind('/') + + if slashPos != -1: + line = line[slashPos + 1:] + + lineLower = line.casefold() + result = self.prog.search(lineLower) + if result: + score = result.end() - result.start() + 1 + score = score + ( len(lineLower) + 1 ) / 100.0 + score = score + ( len(line) + 1 ) / 1000.0 + return (1000.0 / score, line) + + return (0, line) + +class PathScore: + def __init__(self, prog, first_non_tab=False, until_last_tab=False): + self.prog = prog + self.first_non_tab = first_non_tab + self.until_last_tab = until_last_tab + + def __call__(self, line): + lineLower = line.casefold() + if self.first_non_tab: + lineLower = lineLower.split('\t')[0] + if self.until_last_tab: + lineLower = lineLower.rsplit('\t')[0] + result = self.prog.search(lineLower) + if result: + score = result.end() - result.start() + 1 + score = score + ( len(lineLower) + 1 ) / 100.0 + return (1000.0 / score, line) + + return (0, line) + +class VimList: + def __init__(self, name): + self.name = name + self.len = int(vim.eval('len({})'.format(self.name))) + + def __len__(self): + return self.len + + def __getitem__(self, index): + return vim.eval('{}[{}]'.format(self.name, index)) + + def __iter__(self): + for i in range(self.len): + yield self[i] + + def CtrlPPyMatch(): - items = vim.eval('a:items') + items = VimList('a:items') astr = vim.eval('a:str') - lowAstr = astr.lower() + lowAstr = astr.casefold() limit = int(vim.eval('a:limit')) mmode = vim.eval('a:mmode') aregex = int(vim.eval('a:regex')) crfile = vim.eval('a:crfile') - if crfile in items and int(vim.eval("pymatcher#ShouldHideCurrentFile(a:ispath, a:crfile)")): - items.remove(crfile) - rez = vim.eval('s:rez') + pool = Pool(max(1, os.cpu_count()-1)) + chunksize = 4096 + regex = '' if aregex == 1: regex = astr @@ -33,54 +93,36 @@ def CtrlPPyMatch(): regex += escaped[-1] # because this IGNORECASE flag is extremely expensive we are converting everything to lower case # see https://github.com/FelikZ/ctrlp-py-matcher/issues/29 - regex = regex.lower() + regex = regex.casefold() res = [] prog = re.compile(regex) - def filename_score(line): - # get filename via reverse find to improve performance - slashPos = line.rfind('/') - - if slashPos != -1: - line = line[slashPos + 1:] - - lineLower = line.lower() - result = prog.search(lineLower) - if result: - score = result.end() - result.start() + 1 - score = score + ( len(lineLower) + 1 ) / 100.0 - score = score + ( len(line) + 1 ) / 1000.0 - return 1000.0 / score - - return 0 - - def path_score(line): - lineLower = line.lower() - result = prog.search(lineLower) - if result: - score = result.end() - result.start() + 1 - score = score + ( len(lineLower) + 1 ) / 100.0 - return 1000.0 / score - - return 0 - if mmode == 'filename-only': - res = [(filename_score(line), line) for line in items] + filename_score = FilenameScore(prog) + res = pool.imap_unordered(filename_score, items, chunksize) elif mmode == 'first-non-tab': - res = [(path_score(line.split('\t')[0]), line) for line in items] + path_score = PathScore(prog, first_non_tab=True) + res = pool.imap_unordered(path_score, items, chunksize) elif mmode == 'until-last-tab': - res = [(path_score(line.rsplit('\t')[0]), line) for line in items] + path_score = PathScore(prog, until_last_tab=True) + res = pool.imap_unordered(path_score, items, chunksize) else: - res = [(path_score(line), line) for line in items] + path_score = PathScore(prog) + res = pool.imap_unordered(path_score, items, chunksize) + + pool.close() + + rez.extend((line for score, line in heapq.nlargest(limit, res) if score != 0)) - rez.extend([line for score, line in heapq.nlargest(limit, res) if score != 0]) + if int(vim.eval("pymatcher#ShouldHideCurrentFile(a:ispath, a:crfile)")) and crfile in rez: + rez.remove(crfile) # Use double quoted vim strings and escape \ - vimrez = ['"' + line.replace('\\', '\\\\').replace('"', '\\"') + '"' for line in rez] + vimrez = ('"' + line.replace('\\', '\\\\').replace('"', '\\"') + '"' for line in rez) vim.command("let s:regex = '%s'" % regex) vim.command('let s:rez = [%s]' % ','.join(vimrez)) From 0b18c449677fbce0dafd468ed766040eb24452ea Mon Sep 17 00:00:00 2001 From: Chen Yew Ming Date: Wed, 13 Nov 2019 13:21:05 +0800 Subject: [PATCH 2/2] Support Python2 and Windows --- autoload/pymatcher.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/autoload/pymatcher.py b/autoload/pymatcher.py index d9b9d9a..565911e 100644 --- a/autoload/pymatcher.py +++ b/autoload/pymatcher.py @@ -1,11 +1,16 @@ import vim import re import heapq -from multiprocessing import Pool +from multiprocessing import Pool, cpu_count import os _escape = dict((c , "\\" + c) for c in ['^','$','.','{','}','(',')','[',']','\\','/','+']) +if hasattr(str, 'casefold'): + str2lower = lambda s: s.casefold() +else: + str2lower = lambda s: s.lower() + class FilenameScore: def __init__(self, prog): self.prog = prog @@ -17,7 +22,7 @@ def __call_(self, line): if slashPos != -1: line = line[slashPos + 1:] - lineLower = line.casefold() + lineLower = str2lower(line) result = self.prog.search(lineLower) if result: score = result.end() - result.start() + 1 @@ -34,7 +39,7 @@ def __init__(self, prog, first_non_tab=False, until_last_tab=False): self.until_last_tab = until_last_tab def __call__(self, line): - lineLower = line.casefold() + lineLower = str2lower(line) if self.first_non_tab: lineLower = lineLower.split('\t')[0] if self.until_last_tab: @@ -66,7 +71,7 @@ def __iter__(self): def CtrlPPyMatch(): items = VimList('a:items') astr = vim.eval('a:str') - lowAstr = astr.casefold() + lowAstr = str2lower(astr) limit = int(vim.eval('a:limit')) mmode = vim.eval('a:mmode') aregex = int(vim.eval('a:regex')) @@ -74,8 +79,14 @@ def CtrlPPyMatch(): rez = vim.eval('s:rez') - pool = Pool(max(1, os.cpu_count()-1)) - chunksize = 4096 + pool = Pool(max(1, cpu_count()-1)) if os.name == 'posix' else None + + def pool_map(func, items): + chunksize = 4096 + if pool: + return pool.imap_unordered(func, items, chunksize) + else: + return (func(i) for i in items) regex = '' if aregex == 1: @@ -93,28 +104,29 @@ def CtrlPPyMatch(): regex += escaped[-1] # because this IGNORECASE flag is extremely expensive we are converting everything to lower case # see https://github.com/FelikZ/ctrlp-py-matcher/issues/29 - regex = regex.casefold() + regex = str2lower(regex) res = [] prog = re.compile(regex) if mmode == 'filename-only': filename_score = FilenameScore(prog) - res = pool.imap_unordered(filename_score, items, chunksize) + res = pool_map(filename_score, items) elif mmode == 'first-non-tab': path_score = PathScore(prog, first_non_tab=True) - res = pool.imap_unordered(path_score, items, chunksize) + res = pool_map(path_score, items) elif mmode == 'until-last-tab': path_score = PathScore(prog, until_last_tab=True) - res = pool.imap_unordered(path_score, items, chunksize) + res = pool_map(path_score, items) else: path_score = PathScore(prog) - res = pool.imap_unordered(path_score, items, chunksize) + res = pool_map(path_score, items) - pool.close() + if pool: + pool.close() rez.extend((line for score, line in heapq.nlargest(limit, res) if score != 0))