From 58718c7dbd90817159e41fc9a201cd91f77a85f4 Mon Sep 17 00:00:00 2001 From: pictuga Date: Tue, 7 Apr 2015 17:52:53 +0800 Subject: [PATCH 1/3] Make normalize_whitespace faster A mostly-similar result can be achieved much faster. There are two differences: it no longer cares whether there's a line break in the string (every run of whitespace becomes a single space), and leading/trailing whitespace is now stripped. --- breadability/utils.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/breadability/utils.py b/breadability/utils.py index 70a9778..767941d 100644 --- a/breadability/utils.py +++ b/breadability/utils.py @@ -18,9 +18,6 @@ def ignored(*exceptions): pass -MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE) - - def is_blank(text): """ Returns ``True`` if string contains only whitespace characters @@ -36,19 +33,8 @@ def shrink_text(text): def normalize_whitespace(text): """ Translates multiple whitespace into single space character. - If there is at least one new line character chunk is replaced - by single LF (Unix new line) character. 
""" - return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text) - - -def _replace_whitespace(match): - text = match.group() - - if "\n" in text or "\r" in text: - return "\n" - else: - return " " + return ' '.join(text.split()) def cached_property(getter): From 1cfa1090ae05da931655a044f7aec26180c392e0 Mon Sep 17 00:00:00 2001 From: pictuga Date: Tue, 7 Apr 2015 17:57:16 +0800 Subject: [PATCH 2/3] .strip() is useless before normalize_whitespace normalize_whitespace does the job as well --- breadability/scoring.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/breadability/scoring.py b/breadability/scoring.py index a042af2..84f0155 100644 --- a/breadability/scoring.py +++ b/breadability/scoring.py @@ -85,7 +85,7 @@ def get_link_density(node, node_text=None): """ if node_text is None: node_text = node.text_content() - node_text = normalize_whitespace(node_text.strip()) + node_text = normalize_whitespace(node_text) text_length = len(node_text) if text_length == 0: @@ -101,7 +101,7 @@ def get_link_density(node, node_text=None): def _get_normalized_text_length(node): - return len(normalize_whitespace(node.text_content().strip())) + return len(normalize_whitespace(node.text_content())) def get_class_weight(node): From d99b82134c46d0e3262f19f710085165d8e24c75 Mon Sep 17 00:00:00 2001 From: pictuga Date: Tue, 7 Apr 2015 17:58:28 +0800 Subject: [PATCH 3/3] shrink_text is the same as normalize_whitespace --- breadability/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/breadability/utils.py b/breadability/utils.py index 767941d..af4821d 100644 --- a/breadability/utils.py +++ b/breadability/utils.py @@ -26,10 +26,6 @@ def is_blank(text): return not text or text.isspace() -def shrink_text(text): - return normalize_whitespace(text.strip()) - - def normalize_whitespace(text): """ Translates multiple whitespace into single space character. 
@@ -37,6 +33,9 @@ def normalize_whitespace(text): return ' '.join(text.split()) +shrink_text = normalize_whitespace + + def cached_property(getter): """ Decorator that converts a method into memoized property.