microsoft
diff --git a/‎.pylintrc‎
Lines changed: 564 additions & 0 deletions b/‎.pylintrc‎
Lines changed: 564 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 160 additions & 0 deletions b/‎README.md‎
Lines changed: 160 additions & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 30 additions & 0 deletions b/‎setup.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎simplify_docx/__init__.py‎
Lines changed: 84 additions & 0 deletions b/‎simplify_docx/__init__.py‎
Lines changed: 84 additions & 0 deletions
diff --git a/‎simplify_docx/elements/__init__.py‎
Lines changed: 19 additions & 0 deletions b/‎simplify_docx/elements/__init__.py‎
Lines changed: 19 additions & 0 deletions
@@ -1,3 +1,163 @@
+# Overview
+
+DOCX files are complex, and their complexity makes scraping documents
+for their content difficult. The aim of this package is to simplify
+`.docx` files to just the components which carry meaning thereby easing the
+process of document identification and scraping by converting a `.docx`
+file into a predictable and *human readable* JSON file.
+
+Simplifying a complex document down to its *meaningful* parts of course
+requires taking a position on what does and does not convey meaning in a
+document. Generally, this package takes the stance that the document
+structure (body, paragraphs, tables, etc.) is meaningful, as is the text
+itself, whereas text styling (font, font-weight, etc.) is ignored almost
+entirely, with the exception of paragraph indentation and numbering, which
+are often used to create lists, block quotes, etc.  Furthermore, the
+opinions expressed by this package are explained in the Options section
+below and can be changed to suit your needs.
+
+# Usage
+```python
+import docx
+from simplify_docx import simplify
+
+# read in a document 
+my_doc = docx.Document("/path/to/my/favorite/file.docx")
+
+# coerce to JSON using the standard options
+my_doc_as_json = simplify(my_doc)
+
+# or with non-standard options
+my_doc_as_json = simplify(my_doc,{"remove-leading-white-space":False})
+```
+
+# Installation
+
+This project relies on the `python-docx` package which can be installed via
+`pip install python-docx`. **However**, as of this writing, if you wish to
+scrape documents which contain (A) form fields such as drop down lists,
+checkboxes and text inputs or (B) nested documents (subdocs, altChunks,
+etc.), you'll need to clone [this fork](https://github.com/jdthorpe/python-docx) of the python-docx package.
+
+# Options
+
+### General
+
+* **"friendly-names"**: (*Default = `True`*): Use user-friendly type names
+	such as "table-cell", over standard element names like "CT_Tc"
+
+### Ignoring Invisible things
+
+* **"ignore-empty-paragraphs"**: (*Default = `True`*): Empty paragraphs are
+	often used for styling purposes and rarely have significance in the
+	meaning of the document.
+* **"ignore-empty-text"**: (*Default = `True`*): Empty text runs can make an
+	otherwise empty paragraph appear to contain data.
+* **"remove-leading-white-space"**: (*Default = `True`*): Leading white-space
+	at the start of a paragraph is occasionally used for styling purposes
+	and rarely has significance in the interpretation of a document.
+* **"remove-trailing-white-space"**: (*Default = `True`*): Trailing white-space
+	at the end of a paragraph rarely has significance in the interpretation
+	of a document.
+* **"flatten-inner-spaces"**: (*Default = `False`*): Collapse multiple
+	space characters between words to a single space.
+* **"ignore-joiners"**: (*Default = `False`*): Zero width joiner and non-joiner 
+	characters are special characters used to create ligatures in displayed
+	text and don't typically convey meaning (at least in alphabet based
+	languages).
+
+### Special symbols
+
+* **"dumb-quotes"**: (*Default = `True`*): Replace smart quotes with
+	dumb quotes.
+* **"dumb-hyphens"**: (*Default = `True`*): Replace en-dash, em-dash,
+	figure-dash, horizontal bar, and non-breaking hyphens with ordinary hyphens.
+* **"dumb-spaces"**: (*Default = `True`*): Replace zero width spaces, hair 
+	spaces, thin spaces, punctuation spaces, figure spaces, six per em
+	spaces, four per em spaces, three per em spaces, em spaces, en spaces,
+	em quad spaces, and en quad spaces with ordinary spaces.
+* **"special-characters-as-text"**: (*Default = `True`*): Coerce special
+	characters into text equivalents according to the following table:
+
+| Character | Text Equivalent | 
+| --------- | --------------- | 
+| CarriageReturn | `\n` |
+| Break | `\r` |
+| TabChar | `\t` |
+| PositionalTab | `\t` |
+| NoBreakHyphen | `-` |
+| SoftHyphen | `-` |
+
+* **"symbol-as-text"**: (*Default = `True`*): Special symbols often carry
+	meaning other than the underlying unicode character, especially when
+	the font is a special font such as `Wingdings`. If `True` these are
+	included as ordinary text and their font information is omitted.
+* **"empty-as-text"**: (*Default = `False`*): There are a variety of "Empty"
+	tags such as the `<w:yearLong>` tag which cause the current year to
+	be inserted into the document text. If `True`, include these as text
+	formatted as `"[yearLong]"`.
+* **"ignore-left-to-right-mark"**: (*Default = `False`*): Ignore the left-to-right
+	mark, which is not writeable by Python's csv writer.
+* **"ignore-right-to-left-mark"**: (*Default = `False`*): Ignore the right-to-left
+	mark, which is not writeable by Python's csv writer.
+
+### Paragraph style:
+
+Paragraph style markup is one exception to the styling vs. content
+dichotomy. For example, block quotes are often indicated by indenting whole
+paragraphs, and ordered lists, unordered lists and nesting of lists are
+often used to divide sections of a document into logical components.
+
+* **"include-paragraph-indent"**: (*Default = `True`*): Include the
+	indentation markup on paragraph (`CT_P`) elements. Indentation is
+	measured in twips.
+* **"include-paragraph-numbering"**: (*Default = `True`*): Include the
+	numbering styles, which are included in the `CT_P.pPr.numPr` element.
+	The `ilvl` attribute indicates the level of nesting (zero based index)
+	and the `numId` attribute refers to a specific numbering style
+	included in the document's internal styles sheet. 
+
+### Form Elements
+
+* **"simplify-dropdown"**: (*Default = `True`*): Include just the selected
+	and default values, the available options, and the name and label attributes in the form element.
+* **"simplify-textinput"**: (*Default = `True`*): Include just the current
+	and default values, and the name and label attributes in the form element.
+* **"greedy-text-input"**: (*Default = `True`*): Continue consuming run
+	elements when the text-input has not ended at the end of a paragraph,
+	and the next block level element is also a paragraph. This typically
+	occurs when the user presses the return key while editing a text input
+	field.
+* **"simplify-checkbox"**: (*Default = `True`*): Include just the current
+	and default values, and the name and label attributes in the form element.
+* **"use-checkbox-default"**: (*Default = `True`*): If the checkbox has no
+	`value` attribute (typically because the user has not interacted with
+	it), report the default value as the checkbox value.
+* **"checkbox-as-text"**: (*Default = `False`*): Coerce the value of the
+	checkbox to text, represented as either `"[CheckBox:True]"` or `"[CheckBox:False]"`
+* **"dropdown-as-text"**: (*Default = `False`*): Coerce the value of the
+	drop-down to text, represented as `"[DropDown:<selected value>]"`.
+* **"trim-dropdown-options"**: (*Default = `True`*): Remove white-space on
+	the left and right of drop down option items.
+* **"flatten-generic-field"**: (*Default = `True`*): `generic-fields` are
+	`CT_FldChar` runs which are not marked as a drop-down, text-input, or
+	checkbox. These may include special instructions which apply special
+	formatting to a text run (e.g. a hyper link). If `True`, the contents
+	of generic-fields are included in the normal flow of text
+
+### Special content
+
+* **"merge-consecutive-text"**: (*Default = `True`*): Sentences and even single
+	words can be represented by multiple text elements. If `True`,
+	concatenate consecutive text elements into a single text element.
+* **"flatten-hyperlink"**: (*Default = `True`*): Flatten hyperlinks, including
+	their contents in the flow of normal text.
+* **"flatten-smartTag"**: (*Default = `True`*): Flatten smartTag elements, 
+	including their contents in the flow of normal text.
+* **"flatten-customXml"**: (*Default = `True`*): Flatten customXml elements, 
+	including their contents in the flow of normal text.
+* **"flatten-simpleField"**: (*Default = `True`*): Flatten simpleField elements, 
+	including their contents in the flow of normal text.
 
 # Contributing
 
 
@@ -0,0 +1,30 @@
+"""
+Package installation via setup()
+"""
+import codecs
+import os
+import re
+
+from setuptools import find_packages, setup
+
+#Allow single version in source file to be used here
+#From https://packaging.python.org/guides/single-sourcing-package-version/
+def read(*parts):
+    # intentionally *not* adding an encoding option to open
+    # see here: https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690
+    here = os.path.abspath(os.path.dirname(__file__))
+    return codecs.open(os.path.join(here, *parts), 'r').read()
+def find_version(*file_paths):
+    version_file = read(*file_paths)
+    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
+                              version_file, re.M)
+    if version_match:
+        return version_match.group(1)
+    raise RuntimeError("Unable to find version string.")
+
+setup(name="simplify-docx",
+        version=find_version('simplify_docx', '__init__.py'),
+        description="A utility for simplifying python-docx document objects",
+        author="Microsoft Research",
+        packages=['simplify_docx'],
+        license='UNLICENSED',
+        install_requires=["python-docx"])
@@ -0,0 +1,84 @@
+"""
+Coerce Docx Documents to JSON
+
+Not thread safe! (but could be if build_iterators returned the built iterator
+definitions and passed them around...)
+"""
+
+from typing import Union, Dict, Optional, Type, Any
+from .types.fragment import documentPart
+from .utils.walk import walk
+from .utils.friendly_names import apply_friendly_names
+from .elements import document
+from .utils.set_options import set_options as __set_options__
+
+__version__ = "0.1.0"
+
+# --------------------------------------------------
+# Main API
+# --------------------------------------------------
+def simplify(doc: documentPart, options: Optional[Dict[str, Any]] = None):
+    """
+    Coerce Docx Documents to JSON
+    """
+
+    # SET OPTIONS
+    _options: Dict[str, Any]
+    if options:
+        _options = dict(__default_options__, **options)
+    else:
+        _options = __default_options__
+    __set_options__(_options)
+
+    out = document(doc.element).to_json(doc, _options)
+
+    if _options.get("friendly-name", True):
+        apply_friendly_names(out)
+
+    return out
+
+
+# --------------------------------------------------
+# Default Options
+# --------------------------------------------------
+__default_options__: Dict[str, Union[str, bool, int, float]] = {
+    # general
+    "friendly-names": True,  # e.g. "table-cell" rather than "CT_Tc"
+    # flattening special content
+    "flatten-hyperlink": True,
+    "flatten-smartTag": True,
+    "flatten-customXml": True,
+    "flatten-simpleField": True,
+    "merge-consecutive-text": True,
+    "flatten-inner-spaces": False,
+    # possibly meaningful style:
+    "include-paragraph-indent": True,
+    "include-paragraph-numbering": True,
+    # ignoring invisible things
+    "ignore-joiners": True,  # NOTE(review): README documents this default as `False` -- confirm which is intended
+    "ignore-left-to-right-mark": False,
+    "ignore-right-to-left-mark": False,
+    "ignore-empty-table-description": True,  # NOTE(review): not described in the README Options section
+    "ignore-empty-table-caption": True,  # NOTE(review): not described in the README Options section
+    "ignore-empty-paragraphs": True,
+    "ignore-empty-text": True,
+    "remove-trailing-white-space": True,
+    "remove-leading-white-space": True,
+    # forms
+    "use-checkbox-default": True,
+    "greedy-text-input": True,
+    "checkbox-as-text": False,
+    "dropdown-as-text": False,
+    "simplify-dropdown": True,
+    "simplify-textinput": True,
+    "simplify-checkbox": True,
+    "flatten-generic-field": True,
+    "trim-dropdown-options": True,
+    # special symbols
+    "empty-as-text": False,
+    "symbol-as-text": True,
+    "special-characters-as-text": True,
+    "dumb-quotes": True,
+    "dumb-hyphens": True,
+    "dumb-spaces": True,
+}
@@ -0,0 +1,19 @@
+"""
+Docx element objects
+"""
+# from .blocks import smartTag, customXml, fldSimple, hyperlink, paragraph_list, paragraph
+from .base import el, container, IncompatibleTypeError
+
+from .body import body
+from .document import document, altChunk, subDoc, contentPart
+from .table import table, tr, tc
+from .run_contents import text, simpleTextElement, SymbolChar, empty
+from .form import fldChar, checkBox, ddList, textInput, ffData
+from .paragraph import  (
+        EG_PContent,
+        paragraph,
+        hyperlink,
+        fldSimple,
+        customXml,
+        smartTag,
+)