Merge branch 'master' of https://github.com/Microsoft/Simplify-Docx

Jason Thorpe · Jason Thorpe · commit 4606c41ed32e · 2021-07-10T11:17:44.000-07:00
diff --git a/README.md b/README.md
@@ -2,9 +2,9 @@
 
 DOCX files are complex, and their complexity makes scraping documents
 for their content difficult. The aim of this package is to simplify
-`.docx` files to just the components which carry meaning thereby easing the
-process of document identification and scraping by converting a `.docx`
-file into a predictable an *human readable* JSON file.
+`.docx` files to just the components which carry meaning, thereby easing the
+process of pattern matching and data extraction by converting a `.docx`
+file into a predictable and *human readable* JSON file.
 
 Simplifying a complex document down to its *meaningful* parts of course
 requires taking a position on what does and does-not convey meaning in a
@@ -43,9 +43,13 @@ etc.), you'll need to clone [this fork](https://github.com/jdthorpe/python-docx)
 
 ### General
 
-* **"friendly-names"**: (*Default = `True`*): Use user-friendly type names
+* **"friendly-name"**: (*Default = `True`*): Use user-friendly type names
 	such as "table-cell", over standard element names like "CT_Tc"
 
+* **"merge-consecutive-text"**: (*Default = `True`*): Sentences and even single
+	words can be represented by multiple text elements. If `True`,
+	concatenate consecutive text elements into a single text element.
+
 ### Ignoring Invisible things
 
 * **"ignore-empty-paragraphs"**: (*Default = `True`*): Empty paragraphs are
@@ -147,9 +151,6 @@ often used to divide sections of a document into logical components.
 
 ### Special content
 
-* **"merge-consecutive-text"**: (*Default = `True`*): Sentences and even single
-	words can be represented by multiple text elements. If `True`,
-	concatenate consecutive text elements into a single text element.
 * **"flatten-hyperlink"**: (*Default = `True`*): Flatten hyperlinks, including
 	their contents in the flow of normal text.
 * **"flatten-smartTag"**: (*Default = `True`*): Flatten smartTag elements, 
diff --git a/src/simplify_docx/elements/run_contents.py b/src/simplify_docx/elements/run_contents.py
@@ -75,23 +75,23 @@ def to_json(
             )
             _value = _value.replace(u"\u201c", '"').replace(u"\u201d", '"')
 
-        if options.get("dumb-hyphens", True):
+        if options.get("dumb-spaces", True):
             _value = (
-                _value.replace(u"\u2000", "-")
-                .replace(u"\u2001", "-")
-                .replace(u"\u2002", "-")
-                .replace(u"\u2003", "-")
-                .replace(u"\u2004", "-")
-                .replace(u"\u2005", "-")
-                .replace(u"\u2006", "-")
-                .replace(u"\u2007", "-")
-                .replace(u"\u2008", "-")
-                .replace(u"\u2009", "-")
-                .replace(u"\u200A", "-")
-                .replace(u"\u201B", "-")
+                _value.replace(u"\u2000", " ")
+                .replace(u"\u2001", " ")
+                .replace(u"\u2002", " ")
+                .replace(u"\u2003", " ")
+                .replace(u"\u2004", " ")
+                .replace(u"\u2005", " ")
+                .replace(u"\u2006", " ")
+                .replace(u"\u2007", " ")
+                .replace(u"\u2008", " ")
+                .replace(u"\u2009", " ")
+                .replace(u"\u200A", " ")
+                .replace(u"\u201B", " ")
             )
 
-        if options.get("dumb-spaces", True):
+        if options.get("dumb-hyphens", True):
             _value = (
                 _value.replace(u"\u2010", "-")
                 .replace(u"\u2011", "-")
diff --git a/src/simplify_docx/utils/paragrapy_style.py b/src/simplify_docx/utils/paragrapy_style.py
@@ -6,7 +6,7 @@ def get_pStyle(p, doc):
     """
     Get the referenced style element for a paragraph with a p.pPr.pStyle
     """
-    if p.pPr is not None and \
+    if getattr(p, "pPr", None) is not None and \
             p.pPr.pStyle is not None:
         return doc.styles.element.find("w:style[@w:styleId='%s']" % p.pPr.pStyle.val,
                 doc.styles.element.nsmap)
@@ -17,7 +17,7 @@ def get_num_style(p, doc):
     """
     Get the paragraph's numbering style
     """
-    if p.pPr is not None \
+    if getattr(p, "pPr", None) is not None \
             and p.pPr.numPr is not None\
             and p.pPr.numPr.numId is not None:
         # the numbering style doc
@@ -47,7 +47,7 @@ def get_paragraph_ind(p, doc):
     * Direct Formatting
     """
 
-    if p.pPr is not None and\
+    if getattr(p, "pPr", None) is not None and\
             p.pPr.ind is not None:
         return p.pPr.ind