Skip to content

Commit 49fa5ad

Browse files
committed
Refactoring APIs, add TreeNode
1 parent 634c870 commit 49fa5ad

12 files changed

Lines changed: 389 additions & 493 deletions

README.rst

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -269,11 +269,12 @@ Cluster and generate URL Pattern:
269269
for url in urls:
270270
pattern_maker.load(url)
271271
272-
# dump pattern data
272+
# dump patterns
273273
formatter = PatternFormatter()
274-
for cluster in pattern_maker.make():
275-
for pattern in formatter.format(cluster):
276-
print(pattern)
274+
for maker in pattern_maker.makers:
275+
for url_meta, clusterd in maker.make():
276+
for pattern in formatter.format(url_meta, clusterd):
277+
print(pattern)
277278
278279
279280
Match URLs:

src/os_urlpattern/VERSION

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
0.1.6
1+
0.1.7

src/os_urlpattern/cmdline.py

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,8 @@ def _load(self, pattern_maker, args):
100100
speed_logger.debug('[LOADING]')
101101
try:
102102
url = url.decode(DEFAULT_ENCODING)
103-
if pattern_maker.load(url):
103+
_, is_new = pattern_maker.load(url)
104+
if is_new:
104105
stats['UNIQ'] += 1
105106
stats['VALID'] += 1
106107
except (InvalidPatternException,
@@ -120,13 +121,14 @@ def _load(self, pattern_maker, args):
120121
def _process(self, pattern_maker, args):
121122
formatter = FORMATTERS[args.formatter]()
122123
s = time.time()
123-
for pattern_tree in pattern_maker.make(combine=args.formatter == 'ETE'):
124-
e = time.time()
125-
self._logger.debug('[CLUSTER] %d %.2fs',
126-
pattern_tree.root.count, e - s)
127-
for record in formatter.format(pattern_tree):
128-
print(record)
129-
s = time.time()
124+
combine = args.formatter == 'ETE'
125+
for maker in pattern_maker.makers:
126+
for url_meta, root in maker.make(combine):
127+
e = time.time()
128+
self._logger.debug('[CLUSTER] %d %.2fs', root.count, e - s)
129+
for record in formatter.format(url_meta, root):
130+
print(record)
131+
s = time.time()
130132

131133
def _confirm_config(self, args):
132134
if args.formatter != 'CLUSTER':
@@ -172,7 +174,7 @@ def _load(self, pattern_matcher, args):
172174
continue
173175
try:
174176
pattern = pattern.decode(DEFAULT_ENCODING)
175-
pattern_matcher.load(pattern, pattern)
177+
pattern_matcher.load(pattern, meta=pattern)
176178
stats['VALID'] += 1
177179
except Exception as e:
178180
self._logger.warn("%s, %s", str(e), line)
@@ -191,7 +193,7 @@ def _match_result(self, pattern_matcher, raw_url, args):
191193
if not args.all_matched:
192194
sorted(result, reverse=True)
193195
result = result[:1]
194-
result = "\t".join([r.data for r in result]
196+
result = "\t".join([r.meta for r in result]
195197
).encode(DEFAULT_ENCODING)
196198
except (InvalidPatternException,
197199
IrregularURLException,

src/os_urlpattern/formatter.py

Lines changed: 38 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -3,70 +3,83 @@
33
from .compat import StringIO
44
from .definition import BasePatternRule, Symbols
55
from .parse_utils import pack
6-
from .utils import get_ete_tree
6+
from .utils import dump_tree
77

88

99
class Formatter(object):
1010

11-
def format(self, tree, **kwargs):
11+
def format(self, url_meta, tree, **kwargs):
1212
pass
1313

1414

1515
class PatternFormatter(Formatter):
16-
def format(self, clusterd_tree, **kwargs):
17-
url_meta = clusterd_tree.url_meta
18-
for node_path in clusterd_tree.dump_paths():
19-
yield pack(url_meta, [p.pattern for p in node_path[1:]])
16+
def format(self, url_meta, root, **kwargs):
17+
for nodes in dump_tree(root):
18+
yield pack(url_meta, [p.pattern for p in nodes[1:]])
2019
break
2120

2221

2322
class ClusterFormatter(PatternFormatter):
24-
def format(self, clusterd_tree, **kwargs):
25-
for r in super(ClusterFormatter, self).format(clusterd_tree, **kwargs):
23+
def format(self, url_meta, root, **kwargs):
24+
for r in super(ClusterFormatter, self).format(url_meta, root, **kwargs):
2625
yield r
2726

28-
for node_path in clusterd_tree.dump_paths():
29-
for url in node_path[-1].extra_data:
27+
for nodes in dump_tree(root):
28+
for url in nodes[-1].meta:
3029
yield u'\t'.join((u'', url))
3130

3231

3332
class JsonFormatter(Formatter):
34-
def format(self, clusterd_tree, **kwargs):
35-
url_meta = clusterd_tree.url_meta
36-
for node_path in clusterd_tree.dump_paths():
37-
p = pack(url_meta, [p.pattern for p in node_path[1:]])
38-
yield json.dumps({'ptn': p, 'cnt': clusterd_tree.count})
33+
def format(self, url_meta, root, **kwargs):
34+
for nodes in dump_tree(root):
35+
p = pack(url_meta, [p.pattern for p in nodes[1:]])
36+
yield json.dumps({u'ptn': p, u'cnt': root.count})
3937
break
4038

4139

4240
class ETEFormatter(Formatter):
43-
def format(self, pattern_tree, **kwargs):
44-
url_meta = pattern_tree.url_meta
41+
def format(self, url_meta, root, **kwargs):
4542

4643
def f(pattern_node):
4744
sep = Symbols.EMPTY
4845
query_key = Symbols.EMPTY
49-
if url_meta.path_depth < pattern_node.current_level <= (url_meta.path_depth + url_meta.query_depth):
46+
path_depth = url_meta.path_depth
47+
query_depth = url_meta.query_depth
48+
current_level = pattern_node.level
49+
if path_depth < current_level \
50+
and current_level <= (path_depth + query_depth):
5051
sep = Symbols.AMPERSAND
51-
if pattern_node.current_level == url_meta.path_depth + 1:
52+
if current_level == path_depth + 1:
5253
sep = BasePatternRule.SINGLE_QUESTION
53-
query_key = url_meta.query_keys[pattern_node.current_level -
54-
url_meta.path_depth - 1]
55-
elif pattern_node.current_level == url_meta.path_depth + url_meta.query_depth + 1:
54+
query_key = url_meta.query_keys[current_level - path_depth - 1]
55+
elif current_level == path_depth + query_depth + 1:
5656
sep = Symbols.NUMBER
5757
return u' {sep}{query_key}{pattern_string}({count}) '.format(
5858
count=pattern_node.count,
59-
pattern_string=pattern_node,
59+
pattern_string=pattern_node.value,
6060
query_key=query_key,
6161
sep=sep)
6262

63-
if pattern_tree.root.count <= 0:
63+
if root.count <= 0:
6464
return
6565

66-
ete_tree = get_ete_tree(pattern_tree.root, format=f)
66+
ete_tree = get_ete_tree(root, format=f)
6767
yield ete_tree.get_ascii(show_internal=True)
6868

6969

70+
def get_ete_tree(root_node, format=str):
71+
from ete3 import Tree
72+
73+
def add_children(node, ete_node):
74+
for child in node.children:
75+
ete_child = ete_node.add_child(name=format(child))
76+
add_children(child, ete_child)
77+
78+
ete_root_node = Tree(name=format(root_node))
79+
add_children(root_node, ete_root_node)
80+
return ete_root_node
81+
82+
7083
FORMATTERS = {
7184
'PATTERN': PatternFormatter,
7285
'CLUSTER': ClusterFormatter,

src/os_urlpattern/pattern_cluster.py

Lines changed: 37 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,13 @@
11
from collections import Counter, OrderedDict, namedtuple
22

3-
from .compat import iteritems, itervalues
4-
from .parse_utils import URLMeta, digest, number_rule, wildcard_rule
5-
from .parsed_piece_view import (BaseView, LastDotSplitFuzzyView,
6-
MixedView)
3+
from .compat import itervalues
4+
from .parse_utils import (EMPTY_PARSED_PIECE, URLMeta, number_rule,
5+
wildcard_rule)
6+
from .parsed_piece_view import BaseView, LastDotSplitFuzzyView, MixedView
77
from .pattern import Pattern
8-
from .piece_pattern_tree import PiecePatternNode, PiecePatternTree
9-
from .utils import Bag
8+
from .piece_pattern_node import (PiecePatternNode, build_from_parsed_pieces,
9+
build_from_piece_pattern_nodes)
10+
from .utils import Bag, dump_tree
1011

1112

1213
class TBag(Bag):
@@ -112,7 +113,8 @@ def set_pattern(self, pattern):
112113
class ViewPieceBagBucket(PieceBagBucket):
113114
def __init__(self, url_meta):
114115
super(ViewPieceBagBucket, self).__init__()
115-
self._tree = PiecePatternTree(url_meta)
116+
self._url_meta = url_meta
117+
self._root = PiecePatternNode(EMPTY_PARSED_PIECE)
116118

117119
def add(self, view_piece_bag, build_tree=True):
118120
piece_bag = view_piece_bag.piece_bag
@@ -123,24 +125,23 @@ def add(self, view_piece_bag, build_tree=True):
123125
return
124126
view = view_piece_bag.view
125127

126-
self._tree.add_from_parsed_pieces(
127-
view.parsed_pieces,
128-
count=piece_bag.count,
129-
uniq=False)
128+
build_from_parsed_pieces(
129+
self._root, view.parsed_pieces, count=piece_bag.count, uniq=False)
130130

131131
def cluster(self, config, **kwargs):
132-
for clustered_tree in cluster(config, self._tree, **kwargs):
133-
yield self._transfer(clustered_tree)
132+
for clustered in cluster(config, self._url_meta, self._root, **kwargs):
133+
yield self._transfer(clustered)
134134

135-
def _transfer(self, clusterted_tree):
135+
def _transfer(self, root):
136136
pattern = None
137-
bucket = ViewPieceBagBucket(self._tree.url_meta)
138-
for path in clusterted_tree.dump_paths():
139-
piece = u''.join([p.piece for p in path[1:]])
137+
bucket = ViewPieceBagBucket(self._url_meta)
138+
for nodes in dump_tree(root):
139+
piece = u''.join([p.piece for p in nodes[1:]])
140140
view_piece_bag = self[piece]
141141
bucket.add(view_piece_bag, False)
142142
if pattern is None:
143-
pattern = Pattern(u''.join([str(p.pattern) for p in path[1:]]))
143+
pattern = Pattern(
144+
u''.join([str(p.pattern) for p in nodes[1:]]))
144145
return bucket, pattern
145146

146147

@@ -599,7 +600,7 @@ def _process(self):
599600
def add(self, node, add_children=False):
600601
c = self.get_cluster(PiecePatternCluster)
601602
if add_children:
602-
for child in node.iter_children():
603+
for child in node.children:
603604
c.add(child)
604605
else:
605606
c.add(node)
@@ -634,17 +635,16 @@ def _create_next_level_processors(self):
634635
processor.add(node, add_children=True)
635636

636637

637-
def split_by_pattern(piece_pattern_tree):
638-
url_meta = piece_pattern_tree.url_meta
639-
trees = {}
640-
for path in piece_pattern_tree.dump_paths():
641-
pid = digest(url_meta, [p.pattern for p in path[1:]])
642-
if pid not in trees:
643-
trees[pid] = PiecePatternTree(url_meta)
644-
tree = trees[pid]
645-
tree.add_from_piece_pattern_node_path(path[1:])
638+
def split_by_pattern(root):
639+
tree_roots = {}
640+
for nodes in dump_tree(root):
641+
pid = hash(u"/".join([str(p.pattern) for p in nodes]))
642+
if pid not in tree_roots:
643+
tree_roots[pid] = PiecePatternNode(EMPTY_PARSED_PIECE)
644+
sub_root = tree_roots[pid]
645+
build_from_piece_pattern_nodes(sub_root, nodes[1:])
646646

647-
return itervalues(trees)
647+
return itervalues(tree_roots)
648648

649649

650650
def _can_be_splited(processor):
@@ -662,19 +662,18 @@ def _can_be_splited(processor):
662662
return False
663663

664664

665-
def process(config, piece_pattern_tree, **kwargs):
666-
url_meta = piece_pattern_tree.url_meta
665+
def process(config, url_meta, root, **kwargs):
667666
meta_info = MetaInfo(url_meta, 0)
668667
processor = ClusterProcessor(config, meta_info, None, **kwargs)
669-
processor.add(piece_pattern_tree.root)
668+
processor.add(root)
670669
processor.process()
671670
return _can_be_splited(processor)
672671

673672

674-
def cluster(config, piece_pattern_tree, **kwargs):
675-
if not process(config, piece_pattern_tree, **kwargs):
676-
yield piece_pattern_tree
673+
def cluster(config, url_meta, root, **kwargs):
674+
if not process(config, url_meta, root, **kwargs):
675+
yield root
677676
return
678-
for sub_piece_pattern_tree in split_by_pattern(piece_pattern_tree):
679-
for tree in cluster(config, sub_piece_pattern_tree, **kwargs):
680-
yield tree
677+
for sub_root in split_by_pattern(root):
678+
for clustered in cluster(config, url_meta, sub_root, **kwargs):
679+
yield clustered

src/os_urlpattern/pattern_maker.py

Lines changed: 29 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from .compat import itervalues
2-
from .parse_utils import PieceParser, digest, parse_url
2+
from .definition import BasePattern
3+
from .parse_utils import EMPTY_PARSED_PIECE, PieceParser, digest, parse_url
34
from .pattern_cluster import cluster
4-
from .pattern_tree import PatternTree
5-
from .piece_pattern_tree import PiecePatternTree
6-
from .utils import load_obj
5+
from .piece_pattern_node import PiecePatternNode, build_from_parsed_pieces
6+
from .utils import TreeNode, build_tree, dump_tree
77

88

99
class PatternMaker(object):
@@ -13,46 +13,46 @@ def __init__(self, config):
1313
self._makers = {}
1414
self._drop_url = self._config.getboolean('make', 'drop_url')
1515

16+
@property
17+
def makers(self):
18+
return itervalues(self._makers)
19+
1620
def load(self, url):
1721
url_meta, pieces = parse_url(url)
1822
parsed_pieces = [self._parser.parse(piece) for piece in pieces]
1923
sid = digest(url_meta, [p.fuzzy_rule for p in parsed_pieces])
2024
if sid not in self._makers:
2125
self._makers[sid] = Maker(self._config, url_meta)
2226
return self._makers[sid].load(parsed_pieces,
23-
data=url if not self._drop_url else None)
24-
25-
def make(self, combine=False):
26-
for maker in itervalues(self._makers):
27-
for tree in maker.make(combine):
28-
yield tree
27+
meta=url if not self._drop_url else None)
2928

3029

3130
class Maker(object):
3231
def __init__(self, config, url_meta):
3332
self._config = config
3433
self._url_meta = url_meta
35-
self._piece_pattern_tree = PiecePatternTree(url_meta)
34+
self._root = PiecePatternNode(EMPTY_PARSED_PIECE)
3635

37-
def load(self, parsed_pieces, count=1, uniq=True, data=None):
38-
return self._piece_pattern_tree.add_from_parsed_pieces(
39-
parsed_pieces, count=count, uniq=uniq, data=data)
36+
def load(self, parsed_pieces, count=1, meta=None, uniq=True):
37+
return build_from_parsed_pieces(self._root,
38+
parsed_pieces,
39+
count=count,
40+
meta=meta,
41+
uniq=uniq)
4042

41-
def _path_dump_and_load(self, src, dest, index=0):
42-
for path in src.dump_paths():
43-
if path:
44-
dest.load_path(path[index:])
43+
def _cluster(self):
44+
for clustered in cluster(self._config, self._url_meta, self._root):
45+
yield clustered
4546

46-
def cluster(self):
47-
for clusterd_tree in cluster(self._config, self._piece_pattern_tree):
48-
yield clusterd_tree
49-
50-
def make(self, combine):
47+
def make(self, combine=False):
5148
if combine:
52-
pattern_tree = PatternTree(self._url_meta)
53-
for clustered_tree in self.cluster():
54-
self._path_dump_and_load(clustered_tree, pattern_tree, 1)
55-
yield pattern_tree
49+
root = TreeNode(BasePattern.EMPTY)
50+
for clustered in self._cluster():
51+
for nodes in dump_tree(clustered):
52+
build_tree(root, [(n.pattern, n.pattern)
53+
for n in nodes[1:]], nodes[-1].count)
54+
55+
yield self._url_meta, root
5656
else:
57-
for clustered_tree in self.cluster():
58-
yield clustered_tree
57+
for clustered in self._cluster():
58+
yield self._url_meta, clustered

0 commit comments

Comments
 (0)