From effa3886af59f0ea7df40185106dbbef15cc1fe2 Mon Sep 17 00:00:00 2001 From: emmanuelknafo <48259636+emmanuelknafo@users.noreply.github.com> Date: Wed, 11 Mar 2026 23:56:55 -0400 Subject: [PATCH 1/2] Add validators for DOCX, PPTX, and redlining; implement Excel formula recalculation script - Introduced DOCXSchemaValidator for validating Word document XML files against XSD schemas, including checks for whitespace preservation, deletions, insertions, and ID constraints. - Added PPTXSchemaValidator for validating PowerPoint presentation XML files, ensuring proper UUIDs, slide layout IDs, and unique notes slide references. - Created RedliningValidator to check for tracked changes in Word documents, ensuring changes by a specified author are properly tracked. - Developed a recalc.py script to recalculate Excel formulas using LibreOffice, providing detailed error summaries for any issues found during the process. --- .github/skills/.gitkeep | 0 .github/skills/docx/LICENSE.txt | 30 + .github/skills/docx/SKILL.md | 590 +++ .github/skills/docx/scripts/__init__.py | 1 + .github/skills/docx/scripts/accept_changes.py | 135 + .github/skills/docx/scripts/comment.py | 318 ++ .../docx/scripts/office/helpers/__init__.py | 0 .../docx/scripts/office/helpers/merge_runs.py | 199 + .../office/helpers/simplify_redlines.py | 197 + .github/skills/docx/scripts/office/pack.py | 159 + .../schemas/ISO-IEC29500-4_2016/dml-chart.xsd | 1499 ++++++ .../ISO-IEC29500-4_2016/dml-chartDrawing.xsd | 146 + .../ISO-IEC29500-4_2016/dml-diagram.xsd | 1085 ++++ .../ISO-IEC29500-4_2016/dml-lockedCanvas.xsd | 11 + .../schemas/ISO-IEC29500-4_2016/dml-main.xsd | 3081 ++++++++++++ .../ISO-IEC29500-4_2016/dml-picture.xsd | 23 + .../dml-spreadsheetDrawing.xsd | 185 + .../dml-wordprocessingDrawing.xsd | 287 ++ .../schemas/ISO-IEC29500-4_2016/pml.xsd | 1676 +++++++ .../shared-additionalCharacteristics.xsd | 28 + .../shared-bibliography.xsd | 144 + .../shared-commonSimpleTypes.xsd | 174 + .../shared-customXmlDataProperties.xsd | 25 + .../shared-customXmlSchemaProperties.xsd | 18 + .../shared-documentPropertiesCustom.xsd | 59 + .../shared-documentPropertiesExtended.xsd | 56 + .../shared-documentPropertiesVariantTypes.xsd | 195 + .../ISO-IEC29500-4_2016/shared-math.xsd | 582 +++ .../shared-relationshipReference.xsd | 25 + .../schemas/ISO-IEC29500-4_2016/sml.xsd | 4439 +++++++++++++++++ .../schemas/ISO-IEC29500-4_2016/vml-main.xsd | 570 +++ .../ISO-IEC29500-4_2016/vml-officeDrawing.xsd | 509 ++ .../vml-presentationDrawing.xsd | 12 + .../vml-spreadsheetDrawing.xsd | 108 + .../vml-wordprocessingDrawing.xsd | 96 + .../schemas/ISO-IEC29500-4_2016/wml.xsd | 3646 ++++++++++++++ .../schemas/ISO-IEC29500-4_2016/xml.xsd | 116 + .../ecma/fouth-edition/opc-contentTypes.xsd | 42 + .../ecma/fouth-edition/opc-coreProperties.xsd | 50 + .../schemas/ecma/fouth-edition/opc-digSig.xsd | 49 + .../ecma/fouth-edition/opc-relationships.xsd | 33 + .../docx/scripts/office/schemas/mce/mc.xsd | 75 + .../office/schemas/microsoft/wml-2010.xsd | 560 +++ .../office/schemas/microsoft/wml-2012.xsd | 67 + .../office/schemas/microsoft/wml-2018.xsd | 14 + .../office/schemas/microsoft/wml-cex-2018.xsd | 20 + .../office/schemas/microsoft/wml-cid-2016.xsd | 13 + .../microsoft/wml-sdtdatahash-2020.xsd | 4 + .../schemas/microsoft/wml-symex-2015.xsd | 8 + .github/skills/docx/scripts/office/soffice.py | 183 + .github/skills/docx/scripts/office/unpack.py | 132 + .../skills/docx/scripts/office/validate.py | 111 + .../scripts/office/validators/__init__.py | 15 + .../docx/scripts/office/validators/base.py | 847 ++++ .../docx/scripts/office/validators/docx.py | 446 ++ .../docx/scripts/office/validators/pptx.py | 275 + .../scripts/office/validators/redlining.py | 247 + .../docx/scripts/templates/comments.xml | 3 + .../scripts/templates/commentsExtended.xml | 3 + .../scripts/templates/commentsExtensible.xml | 3 + .../docx/scripts/templates/commentsIds.xml | 3 + .../skills/docx/scripts/templates/people.xml | 3 + .github/skills/pdf/LICENSE.txt | 30 + .github/skills/pdf/SKILL.md | 314 ++ .github/skills/pdf/forms.md | 294 ++ .github/skills/pdf/reference.md | 612 +++ .../pdf/scripts/check_bounding_boxes.py | 65 + .../pdf/scripts/check_fillable_fields.py | 11 + .../pdf/scripts/convert_pdf_to_images.py | 33 + .../pdf/scripts/create_validation_image.py | 37 + .../pdf/scripts/extract_form_field_info.py | 122 + .../pdf/scripts/extract_form_structure.py | 115 + .../pdf/scripts/fill_fillable_fields.py | 98 + .../scripts/fill_pdf_form_with_annotations.py | 107 + .../skills/power-bi-dax-optimization/SKILL.md | 173 + .../power-bi-model-design-review/SKILL.md | 403 ++ .../SKILL.md | 382 ++ .../SKILL.md | 351 ++ .github/skills/powerbi-modeling/SKILL.md | 153 + .../references/MEASURES-DAX.md | 195 + .../references/PERFORMANCE.md | 215 + .../references/RELATIONSHIPS.md | 147 + .../skills/powerbi-modeling/references/RLS.md | 226 + .../references/STAR-SCHEMA.md | 103 + .github/skills/pptx/LICENSE.txt | 30 + .github/skills/pptx/SKILL.md | 232 + .github/skills/pptx/editing.md | 205 + .github/skills/pptx/pptxgenjs.md | 420 ++ .github/skills/pptx/scripts/__init__.py | 0 .github/skills/pptx/scripts/add_slide.py | 195 + .github/skills/pptx/scripts/clean.py | 286 ++ .../pptx/scripts/office/helpers/__init__.py | 0 .../pptx/scripts/office/helpers/merge_runs.py | 199 + .../office/helpers/simplify_redlines.py | 197 + .github/skills/pptx/scripts/office/pack.py | 159 + .../schemas/ISO-IEC29500-4_2016/dml-chart.xsd | 1499 ++++++ .../ISO-IEC29500-4_2016/dml-chartDrawing.xsd | 146 + .../ISO-IEC29500-4_2016/dml-diagram.xsd | 1085 ++++ .../ISO-IEC29500-4_2016/dml-lockedCanvas.xsd | 11 + .../schemas/ISO-IEC29500-4_2016/dml-main.xsd | 3081 ++++++++++++ .../ISO-IEC29500-4_2016/dml-picture.xsd | 23 + .../dml-spreadsheetDrawing.xsd | 185 + .../dml-wordprocessingDrawing.xsd | 287 ++ .../schemas/ISO-IEC29500-4_2016/pml.xsd | 1676 +++++++ .../shared-additionalCharacteristics.xsd | 28 + .../shared-bibliography.xsd | 144 + .../shared-commonSimpleTypes.xsd | 174 + .../shared-customXmlDataProperties.xsd | 25 + .../shared-customXmlSchemaProperties.xsd | 18 + .../shared-documentPropertiesCustom.xsd | 59 + .../shared-documentPropertiesExtended.xsd | 56 + .../shared-documentPropertiesVariantTypes.xsd | 195 + .../ISO-IEC29500-4_2016/shared-math.xsd | 582 +++ .../shared-relationshipReference.xsd | 25 + .../schemas/ISO-IEC29500-4_2016/sml.xsd | 4439 +++++++++++++++++ .../schemas/ISO-IEC29500-4_2016/vml-main.xsd | 570 +++ .../ISO-IEC29500-4_2016/vml-officeDrawing.xsd | 509 ++ .../vml-presentationDrawing.xsd | 12 + .../vml-spreadsheetDrawing.xsd | 108 + .../vml-wordprocessingDrawing.xsd | 96 + .../schemas/ISO-IEC29500-4_2016/wml.xsd | 3646 ++++++++++++++ .../schemas/ISO-IEC29500-4_2016/xml.xsd | 116 + .../ecma/fouth-edition/opc-contentTypes.xsd | 42 + .../ecma/fouth-edition/opc-coreProperties.xsd | 50 + .../schemas/ecma/fouth-edition/opc-digSig.xsd | 49 + .../ecma/fouth-edition/opc-relationships.xsd | 33 + .../pptx/scripts/office/schemas/mce/mc.xsd | 75 + .../office/schemas/microsoft/wml-2010.xsd | 560 +++ .../office/schemas/microsoft/wml-2012.xsd | 67 + .../office/schemas/microsoft/wml-2018.xsd | 14 + .../office/schemas/microsoft/wml-cex-2018.xsd | 20 + .../office/schemas/microsoft/wml-cid-2016.xsd | 13 + .../microsoft/wml-sdtdatahash-2020.xsd | 4 + .../schemas/microsoft/wml-symex-2015.xsd | 8 + .github/skills/pptx/scripts/office/soffice.py | 183 + .github/skills/pptx/scripts/office/unpack.py | 132 + .../skills/pptx/scripts/office/validate.py | 111 + .../scripts/office/validators/__init__.py | 15 + .../pptx/scripts/office/validators/base.py | 847 ++++ .../pptx/scripts/office/validators/docx.py | 446 ++ .../pptx/scripts/office/validators/pptx.py | 275 + .../scripts/office/validators/redlining.py | 247 + .github/skills/pptx/scripts/thumbnail.py | 289 ++ .github/skills/xlsx/LICENSE.txt | 30 + .github/skills/xlsx/SKILL.md | 292 ++ .../xlsx/scripts/office/helpers/__init__.py | 0 .../xlsx/scripts/office/helpers/merge_runs.py | 199 + .../office/helpers/simplify_redlines.py | 197 + .github/skills/xlsx/scripts/office/pack.py | 159 + .../schemas/ISO-IEC29500-4_2016/dml-chart.xsd | 1499 ++++++ .../ISO-IEC29500-4_2016/dml-chartDrawing.xsd | 146 + .../ISO-IEC29500-4_2016/dml-diagram.xsd | 1085 ++++ .../ISO-IEC29500-4_2016/dml-lockedCanvas.xsd | 11 + .../schemas/ISO-IEC29500-4_2016/dml-main.xsd | 3081 ++++++++++++ .../ISO-IEC29500-4_2016/dml-picture.xsd | 23 + .../dml-spreadsheetDrawing.xsd | 185 + .../dml-wordprocessingDrawing.xsd | 287 ++ .../schemas/ISO-IEC29500-4_2016/pml.xsd | 1676 +++++++ .../shared-additionalCharacteristics.xsd | 28 + .../shared-bibliography.xsd | 144 + .../shared-commonSimpleTypes.xsd | 174 + .../shared-customXmlDataProperties.xsd | 25 + .../shared-customXmlSchemaProperties.xsd | 18 + .../shared-documentPropertiesCustom.xsd | 59 + .../shared-documentPropertiesExtended.xsd | 56 + .../shared-documentPropertiesVariantTypes.xsd | 195 + .../ISO-IEC29500-4_2016/shared-math.xsd | 582 +++ .../shared-relationshipReference.xsd | 25 + .../schemas/ISO-IEC29500-4_2016/sml.xsd | 4439 +++++++++++++++++ .../schemas/ISO-IEC29500-4_2016/vml-main.xsd | 570 +++ .../ISO-IEC29500-4_2016/vml-officeDrawing.xsd | 509 ++ .../vml-presentationDrawing.xsd | 12 + .../vml-spreadsheetDrawing.xsd | 108 + .../vml-wordprocessingDrawing.xsd | 96 + .../schemas/ISO-IEC29500-4_2016/wml.xsd | 3646 ++++++++++++++ .../schemas/ISO-IEC29500-4_2016/xml.xsd | 116 + .../ecma/fouth-edition/opc-contentTypes.xsd | 42 + .../ecma/fouth-edition/opc-coreProperties.xsd | 50 + .../schemas/ecma/fouth-edition/opc-digSig.xsd | 49 + .../ecma/fouth-edition/opc-relationships.xsd | 33 + .../xlsx/scripts/office/schemas/mce/mc.xsd | 75 + .../office/schemas/microsoft/wml-2010.xsd | 560 +++ .../office/schemas/microsoft/wml-2012.xsd | 67 + .../office/schemas/microsoft/wml-2018.xsd | 14 + .../office/schemas/microsoft/wml-cex-2018.xsd | 20 + .../office/schemas/microsoft/wml-cid-2016.xsd | 13 + .../microsoft/wml-sdtdatahash-2020.xsd | 4 + .../schemas/microsoft/wml-symex-2015.xsd | 8 + .github/skills/xlsx/scripts/office/soffice.py | 183 + .github/skills/xlsx/scripts/office/unpack.py | 132 + .../skills/xlsx/scripts/office/validate.py | 111 + .../scripts/office/validators/__init__.py | 15 + .../xlsx/scripts/office/validators/base.py | 847 ++++ .../xlsx/scripts/office/validators/docx.py | 446 ++ .../xlsx/scripts/office/validators/pptx.py | 275 + .../scripts/office/validators/redlining.py | 247 + .github/skills/xlsx/scripts/recalc.py | 184 + 197 files changed, 75061 insertions(+) create mode 100644 .github/skills/.gitkeep create mode 100644 .github/skills/docx/LICENSE.txt create mode 100644 .github/skills/docx/SKILL.md create mode 100644 .github/skills/docx/scripts/__init__.py create mode 100644 .github/skills/docx/scripts/accept_changes.py create mode 100644 .github/skills/docx/scripts/comment.py create mode 100644 .github/skills/docx/scripts/office/helpers/__init__.py create mode 100644 .github/skills/docx/scripts/office/helpers/merge_runs.py create mode 100644 .github/skills/docx/scripts/office/helpers/simplify_redlines.py create mode 100644 .github/skills/docx/scripts/office/pack.py create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/mce/mc.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd create mode 100644 .github/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd create mode 100644 .github/skills/docx/scripts/office/soffice.py create mode 100644 .github/skills/docx/scripts/office/unpack.py create mode 100644 .github/skills/docx/scripts/office/validate.py create mode 100644 .github/skills/docx/scripts/office/validators/__init__.py create mode 100644 .github/skills/docx/scripts/office/validators/base.py create mode 100644 .github/skills/docx/scripts/office/validators/docx.py create mode 100644 .github/skills/docx/scripts/office/validators/pptx.py create mode 100644 .github/skills/docx/scripts/office/validators/redlining.py create mode 100644 .github/skills/docx/scripts/templates/comments.xml create mode 100644 .github/skills/docx/scripts/templates/commentsExtended.xml create mode 100644 .github/skills/docx/scripts/templates/commentsExtensible.xml create mode 100644 .github/skills/docx/scripts/templates/commentsIds.xml create mode 100644 .github/skills/docx/scripts/templates/people.xml create mode 100644 .github/skills/pdf/LICENSE.txt create mode 100644 .github/skills/pdf/SKILL.md create mode 100644 .github/skills/pdf/forms.md create mode 100644 .github/skills/pdf/reference.md create mode 100644 .github/skills/pdf/scripts/check_bounding_boxes.py create mode 100644 .github/skills/pdf/scripts/check_fillable_fields.py create mode 100644 .github/skills/pdf/scripts/convert_pdf_to_images.py create mode 100644 .github/skills/pdf/scripts/create_validation_image.py create mode 100644 .github/skills/pdf/scripts/extract_form_field_info.py create mode 100644 .github/skills/pdf/scripts/extract_form_structure.py create mode 100644 .github/skills/pdf/scripts/fill_fillable_fields.py create mode 100644 .github/skills/pdf/scripts/fill_pdf_form_with_annotations.py create mode 100644 .github/skills/power-bi-dax-optimization/SKILL.md create mode 100644 .github/skills/power-bi-model-design-review/SKILL.md create mode 100644 .github/skills/power-bi-performance-troubleshooting/SKILL.md create mode 100644 .github/skills/power-bi-report-design-consultation/SKILL.md create mode 100644 .github/skills/powerbi-modeling/SKILL.md create mode 100644 .github/skills/powerbi-modeling/references/MEASURES-DAX.md create mode 100644 .github/skills/powerbi-modeling/references/PERFORMANCE.md create mode 100644 .github/skills/powerbi-modeling/references/RELATIONSHIPS.md create mode 100644 .github/skills/powerbi-modeling/references/RLS.md create mode 100644 .github/skills/powerbi-modeling/references/STAR-SCHEMA.md create mode 100644 .github/skills/pptx/LICENSE.txt create mode 100644 .github/skills/pptx/SKILL.md create mode 100644 .github/skills/pptx/editing.md create mode 100644 .github/skills/pptx/pptxgenjs.md create mode 100644 .github/skills/pptx/scripts/__init__.py create mode 100644 .github/skills/pptx/scripts/add_slide.py create mode 100644 .github/skills/pptx/scripts/clean.py create mode 100644 .github/skills/pptx/scripts/office/helpers/__init__.py create mode 100644 .github/skills/pptx/scripts/office/helpers/merge_runs.py create mode 100644 .github/skills/pptx/scripts/office/helpers/simplify_redlines.py create mode 100644 .github/skills/pptx/scripts/office/pack.py create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/mce/mc.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd create mode 100644 .github/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd create mode 100644 .github/skills/pptx/scripts/office/soffice.py create mode 100644 .github/skills/pptx/scripts/office/unpack.py create mode 100644 .github/skills/pptx/scripts/office/validate.py create mode 100644 .github/skills/pptx/scripts/office/validators/__init__.py create mode 100644 .github/skills/pptx/scripts/office/validators/base.py create mode 100644 .github/skills/pptx/scripts/office/validators/docx.py create mode 100644 .github/skills/pptx/scripts/office/validators/pptx.py create mode 100644 .github/skills/pptx/scripts/office/validators/redlining.py create mode 100644 .github/skills/pptx/scripts/thumbnail.py create mode 100644 .github/skills/xlsx/LICENSE.txt create mode 100644 .github/skills/xlsx/SKILL.md create mode 100644 .github/skills/xlsx/scripts/office/helpers/__init__.py create mode 100644 .github/skills/xlsx/scripts/office/helpers/merge_runs.py create mode 100644 .github/skills/xlsx/scripts/office/helpers/simplify_redlines.py create mode 100644 .github/skills/xlsx/scripts/office/pack.py create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/mce/mc.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd create mode 100644 .github/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd create mode 100644 .github/skills/xlsx/scripts/office/soffice.py create mode 100644 .github/skills/xlsx/scripts/office/unpack.py create mode 100644 .github/skills/xlsx/scripts/office/validate.py create mode 100644 .github/skills/xlsx/scripts/office/validators/__init__.py create mode 100644 .github/skills/xlsx/scripts/office/validators/base.py create mode 100644 .github/skills/xlsx/scripts/office/validators/docx.py create mode 100644 .github/skills/xlsx/scripts/office/validators/pptx.py create mode 100644 .github/skills/xlsx/scripts/office/validators/redlining.py create mode 100644 .github/skills/xlsx/scripts/recalc.py diff --git a/.github/skills/.gitkeep b/.github/skills/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/.github/skills/docx/LICENSE.txt b/.github/skills/docx/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/.github/skills/docx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/.github/skills/docx/SKILL.md b/.github/skills/docx/SKILL.md new file mode 100644 index 0000000..2951e55 --- /dev/null +++ b/.github/skills/docx/SKILL.md @@ -0,0 +1,590 @@ +--- +name: docx +description: "Use this skill whenever the user wants to create, read, edit, or manipulate Word documents (.docx files). Triggers include: any mention of 'Word doc', 'word document', '.docx', or requests to produce professional documents with formatting like tables of contents, headings, page numbers, or letterheads. Also use when extracting or reorganizing content from .docx files, inserting or replacing images in documents, performing find-and-replace in Word files, working with tracked changes or comments, or converting content into a polished Word document. If the user asks for a 'report', 'memo', 'letter', 'template', or similar deliverable as a Word or .docx file, use this skill. Do NOT use for PDFs, spreadsheets, Google Docs, or general coding tasks unrelated to document generation." +license: Proprietary. LICENSE.txt has complete terms +--- + +# DOCX creation, editing, and analysis + +## Overview + +A .docx file is a ZIP archive containing XML files. + +## Quick Reference + +| Task | Approach | +|------|----------| +| Read/analyze content | `pandoc` or unpack for raw XML | +| Create new document | Use `docx-js` - see Creating New Documents below | +| Edit existing document | Unpack → edit XML → repack - see Editing Existing Documents below | + +### Converting .doc to .docx + +Legacy `.doc` files must be converted before editing: + +```bash +python scripts/office/soffice.py --headless --convert-to docx document.doc +``` + +### Reading Content + +```bash +# Text extraction with tracked changes +pandoc --track-changes=all document.docx -o output.md + +# Raw XML access +python scripts/office/unpack.py document.docx unpacked/ +``` + +### Converting to Images + +```bash +python scripts/office/soffice.py --headless --convert-to pdf document.docx +pdftoppm -jpeg -r 150 document.pdf page +``` + +### Accepting Tracked Changes + +To produce a clean document with all tracked changes accepted (requires LibreOffice): + +```bash +python scripts/accept_changes.py input.docx output.docx +``` + +--- + +## Creating New Documents + +Generate .docx files with JavaScript, then validate. Install: `npm install -g docx` + +### Setup +```javascript +const { Document, Packer, Paragraph, TextRun, Table, TableRow, TableCell, ImageRun, + Header, Footer, AlignmentType, PageOrientation, LevelFormat, ExternalHyperlink, + InternalHyperlink, Bookmark, FootnoteReferenceRun, PositionalTab, + PositionalTabAlignment, PositionalTabRelativeTo, PositionalTabLeader, + TabStopType, TabStopPosition, Column, SectionType, + TableOfContents, HeadingLevel, BorderStyle, WidthType, ShadingType, + VerticalAlign, PageNumber, PageBreak } = require('docx'); + +const doc = new Document({ sections: [{ children: [/* content */] }] }); +Packer.toBuffer(doc).then(buffer => fs.writeFileSync("doc.docx", buffer)); +``` + +### Validation +After creating the file, validate it. If validation fails, unpack, fix the XML, and repack. +```bash +python scripts/office/validate.py doc.docx +``` + +### Page Size + +```javascript +// CRITICAL: docx-js defaults to A4, not US Letter +// Always set page size explicitly for consistent results +sections: [{ + properties: { + page: { + size: { + width: 12240, // 8.5 inches in DXA + height: 15840 // 11 inches in DXA + }, + margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 } // 1 inch margins + } + }, + children: [/* content */] +}] +``` + +**Common page sizes (DXA units, 1440 DXA = 1 inch):** + +| Paper | Width | Height | Content Width (1" margins) | +|-------|-------|--------|---------------------------| +| US Letter | 12,240 | 15,840 | 9,360 | +| A4 (default) | 11,906 | 16,838 | 9,026 | + +**Landscape orientation:** docx-js swaps width/height internally, so pass portrait dimensions and let it handle the swap: +```javascript +size: { + width: 12240, // Pass SHORT edge as width + height: 15840, // Pass LONG edge as height + orientation: PageOrientation.LANDSCAPE // docx-js swaps them in the XML +}, +// Content width = 15840 - left margin - right margin (uses the long edge) +``` + +### Styles (Override Built-in Headings) + +Use Arial as the default font (universally supported). Keep titles black for readability. + +```javascript +const doc = new Document({ + styles: { + default: { document: { run: { font: "Arial", size: 24 } } }, // 12pt default + paragraphStyles: [ + // IMPORTANT: Use exact IDs to override built-in styles + { id: "Heading1", name: "Heading 1", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 32, bold: true, font: "Arial" }, + paragraph: { spacing: { before: 240, after: 240 }, outlineLevel: 0 } }, // outlineLevel required for TOC + { id: "Heading2", name: "Heading 2", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 28, bold: true, font: "Arial" }, + paragraph: { spacing: { before: 180, after: 180 }, outlineLevel: 1 } }, + ] + }, + sections: [{ + children: [ + new Paragraph({ heading: HeadingLevel.HEADING_1, children: [new TextRun("Title")] }), + ] + }] +}); +``` + +### Lists (NEVER use unicode bullets) + +```javascript +// ❌ WRONG - never manually insert bullet characters +new Paragraph({ children: [new TextRun("• Item")] }) // BAD +new Paragraph({ children: [new TextRun("\u2022 Item")] }) // BAD + +// ✅ CORRECT - use numbering config with LevelFormat.BULLET +const doc = new Document({ + numbering: { + config: [ + { reference: "bullets", + levels: [{ level: 0, format: LevelFormat.BULLET, text: "•", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + { reference: "numbers", + levels: [{ level: 0, format: LevelFormat.DECIMAL, text: "%1.", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + ] + }, + sections: [{ + children: [ + new Paragraph({ numbering: { reference: "bullets", level: 0 }, + children: [new TextRun("Bullet item")] }), + new Paragraph({ numbering: { reference: "numbers", level: 0 }, + children: [new TextRun("Numbered item")] }), + ] + }] +}); + +// ⚠️ Each reference creates INDEPENDENT numbering +// Same reference = continues (1,2,3 then 4,5,6) +// Different reference = restarts (1,2,3 then 1,2,3) +``` + +### Tables + +**CRITICAL: Tables need dual widths** - set both `columnWidths` on the table AND `width` on each cell. Without both, tables render incorrectly on some platforms. + +```javascript +// CRITICAL: Always set table width for consistent rendering +// CRITICAL: Use ShadingType.CLEAR (not SOLID) to prevent black backgrounds +const border = { style: BorderStyle.SINGLE, size: 1, color: "CCCCCC" }; +const borders = { top: border, bottom: border, left: border, right: border }; + +new Table({ + width: { size: 9360, type: WidthType.DXA }, // Always use DXA (percentages break in Google Docs) + columnWidths: [4680, 4680], // Must sum to table width (DXA: 1440 = 1 inch) + rows: [ + new TableRow({ + children: [ + new TableCell({ + borders, + width: { size: 4680, type: WidthType.DXA }, // Also set on each cell + shading: { fill: "D5E8F0", type: ShadingType.CLEAR }, // CLEAR not SOLID + margins: { top: 80, bottom: 80, left: 120, right: 120 }, // Cell padding (internal, not added to width) + children: [new Paragraph({ children: [new TextRun("Cell")] })] + }) + ] + }) + ] +}) +``` + +**Table width calculation:** + +Always use `WidthType.DXA` — `WidthType.PERCENTAGE` breaks in Google Docs. + +```javascript +// Table width = sum of columnWidths = content width +// US Letter with 1" margins: 12240 - 2880 = 9360 DXA +width: { size: 9360, type: WidthType.DXA }, +columnWidths: [7000, 2360] // Must sum to table width +``` + +**Width rules:** +- **Always use `WidthType.DXA`** — never `WidthType.PERCENTAGE` (incompatible with Google Docs) +- Table width must equal the sum of `columnWidths` +- Cell `width` must match corresponding `columnWidth` +- Cell `margins` are internal padding - they reduce content area, not add to cell width +- For full-width tables: use content width (page width minus left and right margins) + +### Images + +```javascript +// CRITICAL: type parameter is REQUIRED +new Paragraph({ + children: [new ImageRun({ + type: "png", // Required: png, jpg, jpeg, gif, bmp, svg + data: fs.readFileSync("image.png"), + transformation: { width: 200, height: 150 }, + altText: { title: "Title", description: "Desc", name: "Name" } // All three required + })] +}) +``` + +### Page Breaks + +```javascript +// CRITICAL: PageBreak must be inside a Paragraph +new Paragraph({ children: [new PageBreak()] }) + +// Or use pageBreakBefore +new Paragraph({ pageBreakBefore: true, children: [new TextRun("New page")] }) +``` + +### Hyperlinks + +```javascript +// External link +new Paragraph({ + children: [new ExternalHyperlink({ + children: [new TextRun({ text: "Click here", style: "Hyperlink" })], + link: "https://example.com", + })] +}) + +// Internal link (bookmark + reference) +// 1. Create bookmark at destination +new Paragraph({ heading: HeadingLevel.HEADING_1, children: [ + new Bookmark({ id: "chapter1", children: [new TextRun("Chapter 1")] }), +]}) +// 2. Link to it +new Paragraph({ children: [new InternalHyperlink({ + children: [new TextRun({ text: "See Chapter 1", style: "Hyperlink" })], + anchor: "chapter1", +})]}) +``` + +### Footnotes + +```javascript +const doc = new Document({ + footnotes: { + 1: { children: [new Paragraph("Source: Annual Report 2024")] }, + 2: { children: [new Paragraph("See appendix for methodology")] }, + }, + sections: [{ + children: [new Paragraph({ + children: [ + new TextRun("Revenue grew 15%"), + new FootnoteReferenceRun(1), + new TextRun(" using adjusted metrics"), + new FootnoteReferenceRun(2), + ], + })] + }] +}); +``` + +### Tab Stops + +```javascript +// Right-align text on same line (e.g., date opposite a title) +new Paragraph({ + children: [ + new TextRun("Company Name"), + new TextRun("\tJanuary 2025"), + ], + tabStops: [{ type: TabStopType.RIGHT, position: TabStopPosition.MAX }], +}) + +// Dot leader (e.g., TOC-style) +new Paragraph({ + children: [ + new TextRun("Introduction"), + new TextRun({ children: [ + new PositionalTab({ + alignment: PositionalTabAlignment.RIGHT, + relativeTo: PositionalTabRelativeTo.MARGIN, + leader: PositionalTabLeader.DOT, + }), + "3", + ]}), + ], +}) +``` + +### Multi-Column Layouts + +```javascript +// Equal-width columns +sections: [{ + properties: { + column: { + count: 2, // number of columns + space: 720, // gap between columns in DXA (720 = 0.5 inch) + equalWidth: true, + separate: true, // vertical line between columns + }, + }, + children: [/* content flows naturally across columns */] +}] + +// Custom-width columns (equalWidth must be false) +sections: [{ + properties: { + column: { + equalWidth: false, + children: [ + new Column({ width: 5400, space: 720 }), + new Column({ width: 3240 }), + ], + }, + }, + children: [/* content */] +}] +``` + +Force a column break with a new section using `type: SectionType.NEXT_COLUMN`. + +### Table of Contents + +```javascript +// CRITICAL: Headings must use HeadingLevel ONLY - no custom styles +new TableOfContents("Table of Contents", { hyperlink: true, headingStyleRange: "1-3" }) +``` + +### Headers/Footers + +```javascript +sections: [{ + properties: { + page: { margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 } } // 1440 = 1 inch + }, + headers: { + default: new Header({ children: [new Paragraph({ children: [new TextRun("Header")] })] }) + }, + footers: { + default: new Footer({ children: [new Paragraph({ + children: [new TextRun("Page "), new TextRun({ children: [PageNumber.CURRENT] })] + })] }) + }, + children: [/* content */] +}] +``` + +### Critical Rules for docx-js + +- **Set page size explicitly** - docx-js defaults to A4; use US Letter (12240 x 15840 DXA) for US documents +- **Landscape: pass portrait dimensions** - docx-js swaps width/height internally; pass short edge as `width`, long edge as `height`, and set `orientation: PageOrientation.LANDSCAPE` +- **Never use `\n`** - use separate Paragraph elements +- **Never use unicode bullets** - use `LevelFormat.BULLET` with numbering config +- **PageBreak must be in Paragraph** - standalone creates invalid XML +- **ImageRun requires `type`** - always specify png/jpg/etc +- **Always set table `width` with DXA** - never use `WidthType.PERCENTAGE` (breaks in Google Docs) +- **Tables need dual widths** - `columnWidths` array AND cell `width`, both must match +- **Table width = sum of columnWidths** - for DXA, ensure they add up exactly +- **Always add cell margins** - use `margins: { top: 80, bottom: 80, left: 120, right: 120 }` for readable padding +- **Use `ShadingType.CLEAR`** - never SOLID for table shading +- **Never use tables as dividers/rules** - cells have minimum height and render as empty boxes (including in headers/footers); use `border: { bottom: { style: BorderStyle.SINGLE, size: 6, color: "2E75B6", space: 1 } }` on a Paragraph instead. For two-column footers, use tab stops (see Tab Stops section), not tables +- **TOC requires HeadingLevel only** - no custom styles on heading paragraphs +- **Override built-in styles** - use exact IDs: "Heading1", "Heading2", etc. +- **Include `outlineLevel`** - required for TOC (0 for H1, 1 for H2, etc.) + +--- + +## Editing Existing Documents + +**Follow all 3 steps in order.** + +### Step 1: Unpack +```bash +python scripts/office/unpack.py document.docx unpacked/ +``` +Extracts XML, pretty-prints, merges adjacent runs, and converts smart quotes to XML entities (`“` etc.) so they survive editing. Use `--merge-runs false` to skip run merging. + +### Step 2: Edit XML + +Edit files in `unpacked/word/`. See XML Reference below for patterns. + +**Use "Claude" as the author** for tracked changes and comments, unless the user explicitly requests use of a different name. + +**Use the Edit tool directly for string replacement. Do not write Python scripts.** Scripts introduce unnecessary complexity. The Edit tool shows exactly what is being replaced. + +**CRITICAL: Use smart quotes for new content.** When adding text with apostrophes or quotes, use XML entities to produce smart quotes: +```xml + +Here’s a quote: “Hello” +``` +| Entity | Character | +|--------|-----------| +| `‘` | ‘ (left single) | +| `’` | ’ (right single / apostrophe) | +| `“` | “ (left double) | +| `”` | ” (right double) | + +**Adding comments:** Use `comment.py` to handle boilerplate across multiple XML files (text must be pre-escaped XML): +```bash +python scripts/comment.py unpacked/ 0 "Comment text with & and ’" +python scripts/comment.py unpacked/ 1 "Reply text" --parent 0 # reply to comment 0 +python scripts/comment.py unpacked/ 0 "Text" --author "Custom Author" # custom author name +``` +Then add markers to document.xml (see Comments in XML Reference). + +### Step 3: Pack +```bash +python scripts/office/pack.py unpacked/ output.docx --original document.docx +``` +Validates with auto-repair, condenses XML, and creates DOCX. Use `--validate false` to skip. + +**Auto-repair will fix:** +- `durableId` >= 0x7FFFFFFF (regenerates valid ID) +- Missing `xml:space="preserve"` on `` with whitespace + +**Auto-repair won't fix:** +- Malformed XML, invalid element nesting, missing relationships, schema violations + +### Common Pitfalls + +- **Replace entire `` elements**: When adding tracked changes, replace the whole `...` block with `......` as siblings. Don't inject tracked change tags inside a run. +- **Preserve `` formatting**: Copy the original run's `` block into your tracked change runs to maintain bold, font size, etc. + +--- + +## XML Reference + +### Schema Compliance + +- **Element order in ``**: ``, ``, ``, ``, ``, `` last +- **Whitespace**: Add `xml:space="preserve"` to `` with leading/trailing spaces +- **RSIDs**: Must be 8-digit hex (e.g., `00AB1234`) + +### Tracked Changes + +**Insertion:** +```xml + + inserted text + +``` + +**Deletion:** +```xml + + deleted text + +``` + +**Inside ``**: Use `` instead of ``, and `` instead of ``. + +**Minimal edits** - only mark what changes: +```xml + +The term is + + 30 + + + 60 + + days. +``` + +**Deleting entire paragraphs/list items** - when removing ALL content from a paragraph, also mark the paragraph mark as deleted so it merges with the next paragraph. Add `` inside ``: +```xml + + + ... + + + + + + Entire paragraph content being deleted... + + +``` +Without the `` in ``, accepting changes leaves an empty paragraph/list item. + +**Rejecting another author's insertion** - nest deletion inside their insertion: +```xml + + + their inserted text + + +``` + +**Restoring another author's deletion** - add insertion after (don't modify their deletion): +```xml + + deleted text + + + deleted text + +``` + +### Comments + +After running `comment.py` (see Step 2), add markers to document.xml. For replies, use `--parent` flag and nest markers inside the parent's. + +**CRITICAL: `` and `` are siblings of ``, never inside ``.** + +```xml + + + + deleted + + more text + + + + + + + text + + + + +``` + +### Images + +1. Add image file to `word/media/` +2. Add relationship to `word/_rels/document.xml.rels`: +```xml + +``` +3. Add content type to `[Content_Types].xml`: +```xml + +``` +4. Reference in document.xml: +```xml + + + + + + + + + + + + +``` + +--- + +## Dependencies + +- **pandoc**: Text extraction +- **docx**: `npm install -g docx` (new documents) +- **LibreOffice**: PDF conversion (auto-configured for sandboxed environments via `scripts/office/soffice.py`) +- **Poppler**: `pdftoppm` for images diff --git a/.github/skills/docx/scripts/__init__.py b/.github/skills/docx/scripts/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.github/skills/docx/scripts/__init__.py @@ -0,0 +1 @@ + diff --git a/.github/skills/docx/scripts/accept_changes.py b/.github/skills/docx/scripts/accept_changes.py new file mode 100644 index 0000000..8e36316 --- /dev/null +++ b/.github/skills/docx/scripts/accept_changes.py @@ -0,0 +1,135 @@ +"""Accept all tracked changes in a DOCX file using LibreOffice. + +Requires LibreOffice (soffice) to be installed. +""" + +import argparse +import logging +import shutil +import subprocess +from pathlib import Path + +from office.soffice import get_soffice_env + +logger = logging.getLogger(__name__) + +LIBREOFFICE_PROFILE = "/tmp/libreoffice_docx_profile" +MACRO_DIR = f"{LIBREOFFICE_PROFILE}/user/basic/Standard" + +ACCEPT_CHANGES_MACRO = """ + + + Sub AcceptAllTrackedChanges() + Dim document As Object + Dim dispatcher As Object + + document = ThisComponent.CurrentController.Frame + dispatcher = createUnoService("com.sun.star.frame.DispatchHelper") + + dispatcher.executeDispatch(document, ".uno:AcceptAllTrackedChanges", "", 0, Array()) + ThisComponent.store() + ThisComponent.close(True) + End Sub +""" + + +def accept_changes( + input_file: str, + output_file: str, +) -> tuple[None, str]: + input_path = Path(input_file) + output_path = Path(output_file) + + if not input_path.exists(): + return None, f"Error: Input file not found: {input_file}" + + if not input_path.suffix.lower() == ".docx": + return None, f"Error: Input file is not a DOCX file: {input_file}" + + try: + output_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(input_path, output_path) + except Exception as e: + return None, f"Error: Failed to copy input file to output location: {e}" + + if not _setup_libreoffice_macro(): + return None, "Error: Failed to setup LibreOffice macro" + + cmd = [ + "soffice", + "--headless", + f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}", + "--norestore", + "vnd.sun.star.script:Standard.Module1.AcceptAllTrackedChanges?language=Basic&location=application", + str(output_path.absolute()), + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30, + check=False, + env=get_soffice_env(), + ) + except subprocess.TimeoutExpired: + return ( + None, + f"Successfully accepted all tracked changes: {input_file} -> {output_file}", + ) + + if result.returncode != 0: + return None, f"Error: LibreOffice failed: {result.stderr}" + + return ( + None, + f"Successfully accepted all tracked changes: {input_file} -> {output_file}", + ) + + +def _setup_libreoffice_macro() -> bool: + macro_dir = Path(MACRO_DIR) + macro_file = macro_dir / "Module1.xba" + + if macro_file.exists() and "AcceptAllTrackedChanges" in macro_file.read_text(): + return True + + if not macro_dir.exists(): + subprocess.run( + [ + "soffice", + "--headless", + f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}", + "--terminate_after_init", + ], + capture_output=True, + timeout=10, + check=False, + env=get_soffice_env(), + ) + macro_dir.mkdir(parents=True, exist_ok=True) + + try: + macro_file.write_text(ACCEPT_CHANGES_MACRO) + return True + except Exception as e: + logger.warning(f"Failed to setup LibreOffice macro: {e}") + return False + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Accept all tracked changes in a DOCX file" + ) + parser.add_argument("input_file", help="Input DOCX file with tracked changes") + parser.add_argument( + "output_file", help="Output DOCX file (clean, no tracked changes)" + ) + args = parser.parse_args() + + _, message = accept_changes(args.input_file, args.output_file) + print(message) + + if "Error" in message: + raise SystemExit(1) diff --git a/.github/skills/docx/scripts/comment.py b/.github/skills/docx/scripts/comment.py new file mode 100644 index 0000000..36e1c93 --- /dev/null +++ b/.github/skills/docx/scripts/comment.py @@ -0,0 +1,318 @@ +"""Add comments to DOCX documents. + +Usage: + python comment.py unpacked/ 0 "Comment text" + python comment.py unpacked/ 1 "Reply text" --parent 0 + +Text should be pre-escaped XML (e.g., & for &, ’ for smart quotes). + +After running, add markers to document.xml: + + ... commented content ... + + +""" + +import argparse +import random +import shutil +import sys +from datetime import datetime, timezone +from pathlib import Path + +import defusedxml.minidom + +TEMPLATE_DIR = Path(__file__).parent / "templates" +NS = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "w14": "http://schemas.microsoft.com/office/word/2010/wordml", + "w15": "http://schemas.microsoft.com/office/word/2012/wordml", + "w16cid": "http://schemas.microsoft.com/office/word/2016/wordml/cid", + "w16cex": "http://schemas.microsoft.com/office/word/2018/wordml/cex", +} + +COMMENT_XML = """\ + + + + + + + + + + + + + {text} + + +""" + +COMMENT_MARKER_TEMPLATE = """ +Add to document.xml (markers must be direct children of w:p, never inside w:r): + + ... + + """ + +REPLY_MARKER_TEMPLATE = """ +Nest markers inside parent {pid}'s markers (markers must be direct children of w:p, never inside w:r): + + ... + + + """ + + +def _generate_hex_id() -> str: + return f"{random.randint(0, 0x7FFFFFFE):08X}" + + +SMART_QUOTE_ENTITIES = { + "\u201c": "“", + "\u201d": "”", + "\u2018": "‘", + "\u2019": "’", +} + + +def _encode_smart_quotes(text: str) -> str: + for char, entity in SMART_QUOTE_ENTITIES.items(): + text = text.replace(char, entity) + return text + + +def _append_xml(xml_path: Path, root_tag: str, content: str) -> None: + dom = defusedxml.minidom.parseString(xml_path.read_text(encoding="utf-8")) + root = dom.getElementsByTagName(root_tag)[0] + ns_attrs = " ".join(f'xmlns:{k}="{v}"' for k, v in NS.items()) + wrapper_dom = defusedxml.minidom.parseString(f"{content}") + for child in wrapper_dom.documentElement.childNodes: + if child.nodeType == child.ELEMENT_NODE: + root.appendChild(dom.importNode(child, True)) + output = _encode_smart_quotes(dom.toxml(encoding="UTF-8").decode("utf-8")) + xml_path.write_text(output, encoding="utf-8") + + +def _find_para_id(comments_path: Path, comment_id: int) -> str | None: + dom = defusedxml.minidom.parseString(comments_path.read_text(encoding="utf-8")) + for c in dom.getElementsByTagName("w:comment"): + if c.getAttribute("w:id") == str(comment_id): + for p in c.getElementsByTagName("w:p"): + if pid := p.getAttribute("w14:paraId"): + return pid + return None + + +def _get_next_rid(rels_path: Path) -> int: + dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8")) + max_rid = 0 + for rel in dom.getElementsByTagName("Relationship"): + rid = rel.getAttribute("Id") + if rid and rid.startswith("rId"): + try: + max_rid = max(max_rid, int(rid[3:])) + except ValueError: + pass + return max_rid + 1 + + +def _has_relationship(rels_path: Path, target: str) -> bool: + dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8")) + for rel in dom.getElementsByTagName("Relationship"): + if rel.getAttribute("Target") == target: + return True + return False + + +def _has_content_type(ct_path: Path, part_name: str) -> bool: + dom = defusedxml.minidom.parseString(ct_path.read_text(encoding="utf-8")) + for override in dom.getElementsByTagName("Override"): + if override.getAttribute("PartName") == part_name: + return True + return False + + +def _ensure_comment_relationships(unpacked_dir: Path) -> None: + rels_path = unpacked_dir / "word" / "_rels" / "document.xml.rels" + if not rels_path.exists(): + return + + if _has_relationship(rels_path, "comments.xml"): + return + + dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8")) + root = dom.documentElement + next_rid = _get_next_rid(rels_path) + + rels = [ + ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments", + "comments.xml", + ), + ( + "http://schemas.microsoft.com/office/2011/relationships/commentsExtended", + "commentsExtended.xml", + ), + ( + "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds", + "commentsIds.xml", + ), + ( + "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible", + "commentsExtensible.xml", + ), + ] + + for rel_type, target in rels: + rel = dom.createElement("Relationship") + rel.setAttribute("Id", f"rId{next_rid}") + rel.setAttribute("Type", rel_type) + rel.setAttribute("Target", target) + root.appendChild(rel) + next_rid += 1 + + rels_path.write_bytes(dom.toxml(encoding="UTF-8")) + + +def _ensure_comment_content_types(unpacked_dir: Path) -> None: + ct_path = unpacked_dir / "[Content_Types].xml" + if not ct_path.exists(): + return + + if _has_content_type(ct_path, "/word/comments.xml"): + return + + dom = defusedxml.minidom.parseString(ct_path.read_text(encoding="utf-8")) + root = dom.documentElement + + overrides = [ + ( + "/word/comments.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", + ), + ( + "/word/commentsExtended.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml", + ), + ( + "/word/commentsIds.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml", + ), + ( + "/word/commentsExtensible.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml", + ), + ] + + for part_name, content_type in overrides: + override = dom.createElement("Override") + override.setAttribute("PartName", part_name) + override.setAttribute("ContentType", content_type) + root.appendChild(override) + + ct_path.write_bytes(dom.toxml(encoding="UTF-8")) + + +def add_comment( + unpacked_dir: str, + comment_id: int, + text: str, + author: str = "Claude", + initials: str = "C", + parent_id: int | None = None, +) -> tuple[str, str]: + word = Path(unpacked_dir) / "word" + if not word.exists(): + return "", f"Error: {word} not found" + + para_id, durable_id = _generate_hex_id(), _generate_hex_id() + ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + comments = word / "comments.xml" + first_comment = not comments.exists() + if first_comment: + shutil.copy(TEMPLATE_DIR / "comments.xml", comments) + _ensure_comment_relationships(Path(unpacked_dir)) + _ensure_comment_content_types(Path(unpacked_dir)) + _append_xml( + comments, + "w:comments", + COMMENT_XML.format( + id=comment_id, + author=author, + date=ts, + initials=initials, + para_id=para_id, + text=text, + ), + ) + + ext = word / "commentsExtended.xml" + if not ext.exists(): + shutil.copy(TEMPLATE_DIR / "commentsExtended.xml", ext) + if parent_id is not None: + parent_para = _find_para_id(comments, parent_id) + if not parent_para: + return "", f"Error: Parent comment {parent_id} not found" + _append_xml( + ext, + "w15:commentsEx", + f'', + ) + else: + _append_xml( + ext, + "w15:commentsEx", + f'', + ) + + ids = word / "commentsIds.xml" + if not ids.exists(): + shutil.copy(TEMPLATE_DIR / "commentsIds.xml", ids) + _append_xml( + ids, + "w16cid:commentsIds", + f'', + ) + + extensible = word / "commentsExtensible.xml" + if not extensible.exists(): + shutil.copy(TEMPLATE_DIR / "commentsExtensible.xml", extensible) + _append_xml( + extensible, + "w16cex:commentsExtensible", + f'', + ) + + action = "reply" if parent_id is not None else "comment" + return para_id, f"Added {action} {comment_id} (para_id={para_id})" + + +if __name__ == "__main__": + p = argparse.ArgumentParser(description="Add comments to DOCX documents") + p.add_argument("unpacked_dir", help="Unpacked DOCX directory") + p.add_argument("comment_id", type=int, help="Comment ID (must be unique)") + p.add_argument("text", help="Comment text") + p.add_argument("--author", default="Claude", help="Author name") + p.add_argument("--initials", default="C", help="Author initials") + p.add_argument("--parent", type=int, help="Parent comment ID (for replies)") + args = p.parse_args() + + para_id, msg = add_comment( + args.unpacked_dir, + args.comment_id, + args.text, + args.author, + args.initials, + args.parent, + ) + print(msg) + if "Error" in msg: + sys.exit(1) + cid = args.comment_id + if args.parent is not None: + print(REPLY_MARKER_TEMPLATE.format(pid=args.parent, cid=cid)) + else: + print(COMMENT_MARKER_TEMPLATE.format(cid=cid)) diff --git a/.github/skills/docx/scripts/office/helpers/__init__.py b/.github/skills/docx/scripts/office/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.github/skills/docx/scripts/office/helpers/merge_runs.py b/.github/skills/docx/scripts/office/helpers/merge_runs.py new file mode 100644 index 0000000..ad7c25e --- /dev/null +++ b/.github/skills/docx/scripts/office/helpers/merge_runs.py @@ -0,0 +1,199 @@ +"""Merge adjacent runs with identical formatting in DOCX. + +Merges adjacent elements that have identical properties. +Works on runs in paragraphs and inside tracked changes (, ). + +Also: +- Removes rsid attributes from runs (revision metadata that doesn't affect rendering) +- Removes proofErr elements (spell/grammar markers that block merging) +""" + +from pathlib import Path + +import defusedxml.minidom + + +def merge_runs(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + _remove_elements(root, "proofErr") + _strip_run_rsid_attrs(root) + + containers = {run.parentNode for run in _find_elements(root, "r")} + + merge_count = 0 + for container in containers: + merge_count += _merge_runs_in(container) + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Merged {merge_count} runs" + + except Exception as e: + return 0, f"Error: {e}" + + + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def _get_child(parent, tag: str): + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + return child + return None + + +def _get_children(parent, tag: str) -> list: + results = [] + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(child) + return results + + +def _is_adjacent(elem1, elem2) -> bool: + node = elem1.nextSibling + while node: + if node == elem2: + return True + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + return False + + + + +def _remove_elements(root, tag: str): + for elem in _find_elements(root, tag): + if elem.parentNode: + elem.parentNode.removeChild(elem) + + +def _strip_run_rsid_attrs(root): + for run in _find_elements(root, "r"): + for attr in list(run.attributes.values()): + if "rsid" in attr.name.lower(): + run.removeAttribute(attr.name) + + + + +def _merge_runs_in(container) -> int: + merge_count = 0 + run = _first_child_run(container) + + while run: + while True: + next_elem = _next_element_sibling(run) + if next_elem and _is_run(next_elem) and _can_merge(run, next_elem): + _merge_run_content(run, next_elem) + container.removeChild(next_elem) + merge_count += 1 + else: + break + + _consolidate_text(run) + run = _next_sibling_run(run) + + return merge_count + + +def _first_child_run(container): + for child in container.childNodes: + if child.nodeType == child.ELEMENT_NODE and _is_run(child): + return child + return None + + +def _next_element_sibling(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + return sibling + sibling = sibling.nextSibling + return None + + +def _next_sibling_run(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + if _is_run(sibling): + return sibling + sibling = sibling.nextSibling + return None + + +def _is_run(node) -> bool: + name = node.localName or node.tagName + return name == "r" or name.endswith(":r") + + +def _can_merge(run1, run2) -> bool: + rpr1 = _get_child(run1, "rPr") + rpr2 = _get_child(run2, "rPr") + + if (rpr1 is None) != (rpr2 is None): + return False + if rpr1 is None: + return True + return rpr1.toxml() == rpr2.toxml() + + +def _merge_run_content(target, source): + for child in list(source.childNodes): + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name != "rPr" and not name.endswith(":rPr"): + target.appendChild(child) + + +def _consolidate_text(run): + t_elements = _get_children(run, "t") + + for i in range(len(t_elements) - 1, 0, -1): + curr, prev = t_elements[i], t_elements[i - 1] + + if _is_adjacent(prev, curr): + prev_text = prev.firstChild.data if prev.firstChild else "" + curr_text = curr.firstChild.data if curr.firstChild else "" + merged = prev_text + curr_text + + if prev.firstChild: + prev.firstChild.data = merged + else: + prev.appendChild(run.ownerDocument.createTextNode(merged)) + + if merged.startswith(" ") or merged.endswith(" "): + prev.setAttribute("xml:space", "preserve") + elif prev.hasAttribute("xml:space"): + prev.removeAttribute("xml:space") + + run.removeChild(curr) diff --git a/.github/skills/docx/scripts/office/helpers/simplify_redlines.py b/.github/skills/docx/scripts/office/helpers/simplify_redlines.py new file mode 100644 index 0000000..db963bb --- /dev/null +++ b/.github/skills/docx/scripts/office/helpers/simplify_redlines.py @@ -0,0 +1,197 @@ +"""Simplify tracked changes by merging adjacent w:ins or w:del elements. + +Merges adjacent elements from the same author into a single element. +Same for elements. This makes heavily-redlined documents easier to +work with by reducing the number of tracked change wrappers. + +Rules: +- Only merges w:ins with w:ins, w:del with w:del (same element type) +- Only merges if same author (ignores timestamp differences) +- Only merges if truly adjacent (only whitespace between them) +""" + +import xml.etree.ElementTree as ET +import zipfile +from pathlib import Path + +import defusedxml.minidom + +WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def simplify_redlines(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + merge_count = 0 + + containers = _find_elements(root, "p") + _find_elements(root, "tc") + + for container in containers: + merge_count += _merge_tracked_changes_in(container, "ins") + merge_count += _merge_tracked_changes_in(container, "del") + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Simplified {merge_count} tracked changes" + + except Exception as e: + return 0, f"Error: {e}" + + +def _merge_tracked_changes_in(container, tag: str) -> int: + merge_count = 0 + + tracked = [ + child + for child in container.childNodes + if child.nodeType == child.ELEMENT_NODE and _is_element(child, tag) + ] + + if len(tracked) < 2: + return 0 + + i = 0 + while i < len(tracked) - 1: + curr = tracked[i] + next_elem = tracked[i + 1] + + if _can_merge_tracked(curr, next_elem): + _merge_tracked_content(curr, next_elem) + container.removeChild(next_elem) + tracked.pop(i + 1) + merge_count += 1 + else: + i += 1 + + return merge_count + + +def _is_element(node, tag: str) -> bool: + name = node.localName or node.tagName + return name == tag or name.endswith(f":{tag}") + + +def _get_author(elem) -> str: + author = elem.getAttribute("w:author") + if not author: + for attr in elem.attributes.values(): + if attr.localName == "author" or attr.name.endswith(":author"): + return attr.value + return author + + +def _can_merge_tracked(elem1, elem2) -> bool: + if _get_author(elem1) != _get_author(elem2): + return False + + node = elem1.nextSibling + while node and node != elem2: + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + + return True + + +def _merge_tracked_content(target, source): + while source.firstChild: + child = source.firstChild + source.removeChild(child) + target.appendChild(child) + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]: + if not doc_xml_path.exists(): + return {} + + try: + tree = ET.parse(doc_xml_path) + root = tree.getroot() + except ET.ParseError: + return {} + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + + return authors + + +def _get_authors_from_docx(docx_path: Path) -> dict[str, int]: + try: + with zipfile.ZipFile(docx_path, "r") as zf: + if "word/document.xml" not in zf.namelist(): + return {} + with zf.open("word/document.xml") as f: + tree = ET.parse(f) + root = tree.getroot() + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + return authors + except (zipfile.BadZipFile, ET.ParseError): + return {} + + +def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str: + modified_xml = modified_dir / "word" / "document.xml" + modified_authors = get_tracked_change_authors(modified_xml) + + if not modified_authors: + return default + + original_authors = _get_authors_from_docx(original_docx) + + new_changes: dict[str, int] = {} + for author, count in modified_authors.items(): + original_count = original_authors.get(author, 0) + diff = count - original_count + if diff > 0: + new_changes[author] = diff + + if not new_changes: + return default + + if len(new_changes) == 1: + return next(iter(new_changes)) + + raise ValueError( + f"Multiple authors added new changes: {new_changes}. " + "Cannot infer which author to validate." + ) diff --git a/.github/skills/docx/scripts/office/pack.py b/.github/skills/docx/scripts/office/pack.py new file mode 100644 index 0000000..db29ed8 --- /dev/null +++ b/.github/skills/docx/scripts/office/pack.py @@ -0,0 +1,159 @@ +"""Pack a directory into a DOCX, PPTX, or XLSX file. + +Validates with auto-repair, condenses XML formatting, and creates the Office file. + +Usage: + python pack.py [--original ] [--validate true|false] + +Examples: + python pack.py unpacked/ output.docx --original input.docx + python pack.py unpacked/ output.pptx --validate false +""" + +import argparse +import sys +import shutil +import tempfile +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + +def pack( + input_directory: str, + output_file: str, + original_file: str | None = None, + validate: bool = True, + infer_author_func=None, +) -> tuple[None, str]: + input_dir = Path(input_directory) + output_path = Path(output_file) + suffix = output_path.suffix.lower() + + if not input_dir.is_dir(): + return None, f"Error: {input_dir} is not a directory" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {output_file} must be a .docx, .pptx, or .xlsx file" + + if validate and original_file: + original_path = Path(original_file) + if original_path.exists(): + success, output = _run_validation( + input_dir, original_path, suffix, infer_author_func + ) + if output: + print(output) + if not success: + return None, f"Error: Validation failed for {input_dir}" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + _condense_xml(xml_file) + + output_path.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + return None, f"Successfully packed {input_dir} to {output_file}" + + +def _run_validation( + unpacked_dir: Path, + original_file: Path, + suffix: str, + infer_author_func=None, +) -> tuple[bool, str | None]: + output_lines = [] + validators = [] + + if suffix == ".docx": + author = "Claude" + if infer_author_func: + try: + author = infer_author_func(unpacked_dir, original_file) + except ValueError as e: + print(f"Warning: {e} Using default author 'Claude'.", file=sys.stderr) + + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file), + RedliningValidator(unpacked_dir, original_file, author=author), + ] + elif suffix == ".pptx": + validators = [PPTXSchemaValidator(unpacked_dir, original_file)] + + if not validators: + return True, None + + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + output_lines.append(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + output_lines.append("All validations PASSED!") + + return success, "\n".join(output_lines) if output_lines else None + + +def _condense_xml(xml_file: Path) -> None: + try: + with open(xml_file, encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + for element in dom.getElementsByTagName("*"): + if element.tagName.endswith(":t"): + continue + + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + except Exception as e: + print(f"ERROR: Failed to parse {xml_file.name}: {e}", file=sys.stderr) + raise + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pack a directory into a DOCX, PPTX, or XLSX file" + ) + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument( + "--original", + help="Original file for validation comparison", + ) + parser.add_argument( + "--validate", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Run validation with auto-repair (default: true)", + ) + args = parser.parse_args() + + _, message = pack( + args.input_directory, + args.output_file, + original_file=args.original, + validate=args.validate, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 0000000..6454ef9 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 0000000..afa4f46 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 0000000..64e66b8 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 0000000..687eea8 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 0000000..6ac81b0 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 0000000..1dbf051 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..f1af17d --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..0a185ab --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 0000000..14ef488 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 0000000..c20f3bf --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 0000000..ac60252 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 0000000..424b8ba --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 0000000..2bddce2 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 0000000..8a8c18b --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 0000000..5c42706 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 0000000..853c341 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 0000000..da835ee --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 0000000..87ad265 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 0000000..9e86f1b --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 0000000..d0be42e --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 0000000..8821dd1 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 0000000..ca2575c --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 0000000..dd079e6 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..3dd6cf6 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..f1041e3 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 0000000..9c5b7a6 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 0000000..0f13678 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 0000000..a6de9d2 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 0000000..10e978b --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 0000000..4248bf7 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 0000000..5649746 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/mce/mc.xsd b/.github/skills/docx/scripts/office/schemas/mce/mc.xsd new file mode 100644 index 0000000..ef72545 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd b/.github/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd new file mode 100644 index 0000000..f65f777 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd b/.github/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd new file mode 100644 index 0000000..6b00755 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd b/.github/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd new file mode 100644 index 0000000..f321d33 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd b/.github/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 0000000..364c6a9 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd b/.github/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 0000000..fed9d15 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/.github/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd b/.github/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 0000000..680cf15 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/.github/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd b/.github/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 0000000..89ada90 --- /dev/null +++ b/.github/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/.github/skills/docx/scripts/office/soffice.py b/.github/skills/docx/scripts/office/soffice.py new file mode 100644 index 0000000..c7f7e32 --- /dev/null +++ b/.github/skills/docx/scripts/office/soffice.py @@ -0,0 +1,183 @@ +""" +Helper for running LibreOffice (soffice) in environments where AF_UNIX +sockets may be blocked (e.g., sandboxed VMs). Detects the restriction +at runtime and applies an LD_PRELOAD shim if needed. + +Usage: + from office.soffice import run_soffice, get_soffice_env + + # Option 1 – run soffice directly + result = run_soffice(["--headless", "--convert-to", "pdf", "input.docx"]) + + # Option 2 – get env dict for your own subprocess calls + env = get_soffice_env() + subprocess.run(["soffice", ...], env=env) +""" + +import os +import socket +import subprocess +import tempfile +from pathlib import Path + + +def get_soffice_env() -> dict: + env = os.environ.copy() + env["SAL_USE_VCLPLUGIN"] = "svp" + + if _needs_shim(): + shim = _ensure_shim() + env["LD_PRELOAD"] = str(shim) + + return env + + +def run_soffice(args: list[str], **kwargs) -> subprocess.CompletedProcess: + env = get_soffice_env() + return subprocess.run(["soffice"] + args, env=env, **kwargs) + + + +_SHIM_SO = Path(tempfile.gettempdir()) / "lo_socket_shim.so" + + +def _needs_shim() -> bool: + try: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.close() + return False + except OSError: + return True + + +def _ensure_shim() -> Path: + if _SHIM_SO.exists(): + return _SHIM_SO + + src = Path(tempfile.gettempdir()) / "lo_socket_shim.c" + src.write_text(_SHIM_SOURCE) + subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", str(_SHIM_SO), str(src), "-ldl"], + check=True, + capture_output=True, + ) + src.unlink() + return _SHIM_SO + + + +_SHIM_SOURCE = r""" +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +static int (*real_socket)(int, int, int); +static int (*real_socketpair)(int, int, int, int[2]); +static int (*real_listen)(int, int); +static int (*real_accept)(int, struct sockaddr *, socklen_t *); +static int (*real_close)(int); +static int (*real_read)(int, void *, size_t); + +/* Per-FD bookkeeping (FDs >= 1024 are passed through unshimmed). */ +static int is_shimmed[1024]; +static int peer_of[1024]; +static int wake_r[1024]; /* accept() blocks reading this */ +static int wake_w[1024]; /* close() writes to this */ +static int listener_fd = -1; /* FD that received listen() */ + +__attribute__((constructor)) +static void init(void) { + real_socket = dlsym(RTLD_NEXT, "socket"); + real_socketpair = dlsym(RTLD_NEXT, "socketpair"); + real_listen = dlsym(RTLD_NEXT, "listen"); + real_accept = dlsym(RTLD_NEXT, "accept"); + real_close = dlsym(RTLD_NEXT, "close"); + real_read = dlsym(RTLD_NEXT, "read"); + for (int i = 0; i < 1024; i++) { + peer_of[i] = -1; + wake_r[i] = -1; + wake_w[i] = -1; + } +} + +/* ---- socket ---------------------------------------------------------- */ +int socket(int domain, int type, int protocol) { + if (domain == AF_UNIX) { + int fd = real_socket(domain, type, protocol); + if (fd >= 0) return fd; + /* socket(AF_UNIX) blocked – fall back to socketpair(). */ + int sv[2]; + if (real_socketpair(domain, type, protocol, sv) == 0) { + if (sv[0] >= 0 && sv[0] < 1024) { + is_shimmed[sv[0]] = 1; + peer_of[sv[0]] = sv[1]; + int wp[2]; + if (pipe(wp) == 0) { + wake_r[sv[0]] = wp[0]; + wake_w[sv[0]] = wp[1]; + } + } + return sv[0]; + } + errno = EPERM; + return -1; + } + return real_socket(domain, type, protocol); +} + +/* ---- listen ---------------------------------------------------------- */ +int listen(int sockfd, int backlog) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + listener_fd = sockfd; + return 0; + } + return real_listen(sockfd, backlog); +} + +/* ---- accept ---------------------------------------------------------- */ +int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + /* Block until close() writes to the wake pipe. */ + if (wake_r[sockfd] >= 0) { + char buf; + real_read(wake_r[sockfd], &buf, 1); + } + errno = ECONNABORTED; + return -1; + } + return real_accept(sockfd, addr, addrlen); +} + +/* ---- close ----------------------------------------------------------- */ +int close(int fd) { + if (fd >= 0 && fd < 1024 && is_shimmed[fd]) { + int was_listener = (fd == listener_fd); + is_shimmed[fd] = 0; + + if (wake_w[fd] >= 0) { /* unblock accept() */ + char c = 0; + write(wake_w[fd], &c, 1); + real_close(wake_w[fd]); + wake_w[fd] = -1; + } + if (wake_r[fd] >= 0) { real_close(wake_r[fd]); wake_r[fd] = -1; } + if (peer_of[fd] >= 0) { real_close(peer_of[fd]); peer_of[fd] = -1; } + + if (was_listener) + _exit(0); /* conversion done – exit */ + } + return real_close(fd); +} +""" + + + +if __name__ == "__main__": + import sys + result = run_soffice(sys.argv[1:]) + sys.exit(result.returncode) diff --git a/.github/skills/docx/scripts/office/unpack.py b/.github/skills/docx/scripts/office/unpack.py new file mode 100644 index 0000000..0015253 --- /dev/null +++ b/.github/skills/docx/scripts/office/unpack.py @@ -0,0 +1,132 @@ +"""Unpack Office files (DOCX, PPTX, XLSX) for editing. + +Extracts the ZIP archive, pretty-prints XML files, and optionally: +- Merges adjacent runs with identical formatting (DOCX only) +- Simplifies adjacent tracked changes from same author (DOCX only) + +Usage: + python unpack.py [options] + +Examples: + python unpack.py document.docx unpacked/ + python unpack.py presentation.pptx unpacked/ + python unpack.py document.docx unpacked/ --merge-runs false +""" + +import argparse +import sys +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from helpers.merge_runs import merge_runs as do_merge_runs +from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines + +SMART_QUOTE_REPLACEMENTS = { + "\u201c": "“", + "\u201d": "”", + "\u2018": "‘", + "\u2019": "’", +} + + +def unpack( + input_file: str, + output_directory: str, + merge_runs: bool = True, + simplify_redlines: bool = True, +) -> tuple[None, str]: + input_path = Path(input_file) + output_path = Path(output_directory) + suffix = input_path.suffix.lower() + + if not input_path.exists(): + return None, f"Error: {input_file} does not exist" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file" + + try: + output_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(input_path, "r") as zf: + zf.extractall(output_path) + + xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) + for xml_file in xml_files: + _pretty_print_xml(xml_file) + + message = f"Unpacked {input_file} ({len(xml_files)} XML files)" + + if suffix == ".docx": + if simplify_redlines: + simplify_count, _ = do_simplify_redlines(str(output_path)) + message += f", simplified {simplify_count} tracked changes" + + if merge_runs: + merge_count, _ = do_merge_runs(str(output_path)) + message += f", merged {merge_count} runs" + + for xml_file in xml_files: + _escape_smart_quotes(xml_file) + + return None, message + + except zipfile.BadZipFile: + return None, f"Error: {input_file} is not a valid Office file" + except Exception as e: + return None, f"Error unpacking: {e}" + + +def _pretty_print_xml(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="utf-8")) + except Exception: + pass + + +def _escape_smart_quotes(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + for char, entity in SMART_QUOTE_REPLACEMENTS.items(): + content = content.replace(char, entity) + xml_file.write_text(content, encoding="utf-8") + except Exception: + pass + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Unpack an Office file (DOCX, PPTX, XLSX) for editing" + ) + parser.add_argument("input_file", help="Office file to unpack") + parser.add_argument("output_directory", help="Output directory") + parser.add_argument( + "--merge-runs", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent runs with identical formatting (DOCX only, default: true)", + ) + parser.add_argument( + "--simplify-redlines", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent tracked changes from same author (DOCX only, default: true)", + ) + args = parser.parse_args() + + _, message = unpack( + args.input_file, + args.output_directory, + merge_runs=args.merge_runs, + simplify_redlines=args.simplify_redlines, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/.github/skills/docx/scripts/office/validate.py b/.github/skills/docx/scripts/office/validate.py new file mode 100644 index 0000000..03b01f6 --- /dev/null +++ b/.github/skills/docx/scripts/office/validate.py @@ -0,0 +1,111 @@ +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. + +Usage: + python validate.py [--original ] [--auto-repair] [--author NAME] + +The first argument can be either: +- An unpacked directory containing the Office document XML files +- A packed Office file (.docx/.pptx/.xlsx) which will be unpacked to a temp directory + +Auto-repair fixes: +- paraId/durableId values that exceed OOXML limits +- Missing xml:space="preserve" on w:t elements with whitespace +""" + +import argparse +import sys +import tempfile +import zipfile +from pathlib import Path + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "path", + help="Path to unpacked directory or packed Office file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "--original", + required=False, + default=None, + help="Path to original file (.docx/.pptx/.xlsx). If omitted, all XSD errors are reported and redlining validation is skipped.", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--auto-repair", + action="store_true", + help="Automatically repair common issues (hex IDs, whitespace preservation)", + ) + parser.add_argument( + "--author", + default="Claude", + help="Author name for redlining validation (default: Claude)", + ) + args = parser.parse_args() + + path = Path(args.path) + assert path.exists(), f"Error: {path} does not exist" + + original_file = None + if args.original: + original_file = Path(args.original) + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert original_file.suffix.lower() in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + file_extension = (original_file or path).suffix.lower() + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: Cannot determine file type from {path}. Use --original or provide a .docx/.pptx/.xlsx file." + ) + + if path.is_file() and path.suffix.lower() in [".docx", ".pptx", ".xlsx"]: + temp_dir = tempfile.mkdtemp() + with zipfile.ZipFile(path, "r") as zf: + zf.extractall(temp_dir) + unpacked_dir = Path(temp_dir) + else: + assert path.is_dir(), f"Error: {path} is not a directory or Office file" + unpacked_dir = path + + match file_extension: + case ".docx": + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + if original_file: + validators.append( + RedliningValidator(unpacked_dir, original_file, verbose=args.verbose, author=args.author) + ) + case ".pptx": + validators = [ + PPTXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + if args.auto_repair: + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + print(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/.github/skills/docx/scripts/office/validators/__init__.py b/.github/skills/docx/scripts/office/validators/__init__.py new file mode 100644 index 0000000..db092ec --- /dev/null +++ b/.github/skills/docx/scripts/office/validators/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/.github/skills/docx/scripts/office/validators/base.py b/.github/skills/docx/scripts/office/validators/base.py new file mode 100644 index 0000000..db4a06a --- /dev/null +++ b/.github/skills/docx/scripts/office/validators/base.py @@ -0,0 +1,847 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import defusedxml.minidom +import lxml.etree + + +class BaseSchemaValidator: + + IGNORED_VALIDATION_ERRORS = [ + "hyphenationZone", + "purl.org/dc/terms", + ] + + UNIQUE_ID_REQUIREMENTS = { + "comment": ("id", "file"), + "commentrangestart": ("id", "file"), + "commentrangeend": ("id", "file"), + "bookmarkstart": ("id", "file"), + "bookmarkend": ("id", "file"), + "sldid": ("id", "file"), + "sldmasterid": ("id", "global"), + "sldlayoutid": ("id", "global"), + "cm": ("authorid", "file"), + "sheet": ("sheetid", "file"), + "definedname": ("id", "file"), + "cxnsp": ("id", "file"), + "sp": ("id", "file"), + "pic": ("id", "file"), + "grpsp": ("id", "file"), + } + + EXCLUDED_ID_CONTAINERS = { + "sectionlst", + } + + ELEMENT_RELATIONSHIP_TYPES = {} + + SCHEMA_MAPPINGS = { + "word": "ISO-IEC29500-4_2016/wml.xsd", + "ppt": "ISO-IEC29500-4_2016/pml.xsd", + "xl": "ISO-IEC29500-4_2016/sml.xsd", + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file=None, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) if original_file else None + self.verbose = verbose + + self.schemas_dir = Path(__file__).parent.parent / "schemas" + + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + raise NotImplementedError("Subclasses must implement the validate method") + + def repair(self) -> int: + return self.repair_whitespace_preservation() + + def repair_whitespace_preservation(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if elem.tagName.endswith(":t") and elem.firstChild: + text = elem.firstChild.nodeValue + if text and (text.startswith((' ', '\t')) or text.endswith((' ', '\t'))): + if elem.getAttribute("xml:space") != "preserve": + elem.setAttribute("xml:space", "preserve") + text_preview = repr(text[:30]) + "..." if len(text) > 30 else repr(text) + print(f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}") + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + def validate_xml(self): + errors = [] + + for xml_file in self.xml_files: + try: + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + errors = [] + global_ids = {} + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} + + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + for elem in root.iter(): + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + if tag in self.UNIQUE_ID_REQUIREMENTS: + in_excluded_container = any( + ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS + for ancestor in elem.iterancestors() + ) + if in_excluded_container: + continue + + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + errors = [] + + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): + all_files.append(file_path.resolve()) + + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + for rels_file in rels_files: + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + rels_dir = rels_file.parent + + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): + if target.startswith("/"): + target_path = self.unpacked_dir / target.lstrip("/") + elif rels_file.name == ".rels": + target_path = self.unpacked_dir / target + else: + base_dir = rels_dir.parent + target_path = base_dir / target + + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + import lxml.etree + + errors = [] + + for xml_file in self.xml_files: + if xml_file.suffix == ".rels": + continue + + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + if not rels_file.exists(): + continue + + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE + rid_attrs_to_check = ["id", "embed", "link"] + for elem in xml_root.iter(): + for attr_name in rid_attrs_to_check: + rid_attr = elem.get(f"{{{r_ns}}}{attr_name}") + if not rid_attr: + continue + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})" + ) + elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + elem_lower = element_name.lower() + + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + if elem_lower.endswith("id") and len(elem_lower) > 2: + prefix = elem_lower[:-2] + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + if prefix == "sld": + return "slide" + return prefix.lower() + + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] + return prefix.lower() + + return None + + def validate_content_types(self): + errors = [] + + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", + "document", + "workbook", + "worksheet", + "theme", + } + + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue + + for file_path in all_files: + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() + elif is_valid: + return True, set() + + original_errors = self._get_original_file_errors(xml_file) + + assert current_errors is not None + new_errors = current_errors - original_errors + + new_errors = { + e for e in new_errors + if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS) + } + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + original_error_count += 1 + valid_count += 1 + continue + + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: + new_errors.append( + f" - {error[:250]}..." if len(error) > 250 else f" - {error}" + ) + + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + for attr in attrs_to_remove: + del elem.attrib[attr] + + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + elements_to_remove = [] + + for elem in list(root): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + self._remove_ignorable_elements(elem) + + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + root = xml_doc.getroot() + + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None + + try: + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + if self.original_file is None: + return set() + + import tempfile + import zipfile + + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + return set() + + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + for elem in xml_copy.iter(): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/docx/scripts/office/validators/docx.py b/.github/skills/docx/scripts/office/validators/docx.py new file mode 100644 index 0000000..fec405e --- /dev/null +++ b/.github/skills/docx/scripts/office/validators/docx.py @@ -0,0 +1,446 @@ +""" +Validator for Word document XML files against XSD schemas. +""" + +import random +import re +import tempfile +import zipfile + +import defusedxml.minidom +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml" + W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid" + + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_whitespace_preservation(): + all_valid = False + + if not self.validate_deletions(): + all_valid = False + + if not self.validate_insertions(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_id_constraints(): + all_valid = False + + if not self.validate_comment_markers(): + all_valid = False + + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + if re.search(r"^[ \t\n\r]", text) or re.search( + r"[ \t\n\r]$", text + ): + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces): + if t_elem.text: + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + for instr_elem in root.xpath( + ".//w:del//w:instrText", namespaces=namespaces + ): + text_preview = ( + repr(instr_elem.text or "")[:50] + "..." + if len(repr(instr_elem.text or "")) > 50 + else repr(instr_elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {instr_elem.sourceline}: found within (use ): {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + count = 0 + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + original = self.original_file + if original is None: + return 0 + + count = 0 + + try: + with tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(original, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + def _parse_id_value(self, val: str, base: int = 16) -> int: + return int(val, base) + + def validate_id_constraints(self): + errors = [] + para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId" + durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId" + + for xml_file in self.xml_files: + try: + for elem in lxml.etree.parse(str(xml_file)).iter(): + if val := elem.get(para_id_attr): + if self._parse_id_value(val, base=16) >= 0x80000000: + errors.append( + f" {xml_file.name}:{elem.sourceline}: paraId={val} >= 0x80000000" + ) + + if val := elem.get(durable_id_attr): + if xml_file.name == "numbering.xml": + try: + if self._parse_id_value(val, base=10) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except ValueError: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} must be decimal in numbering.xml" + ) + else: + if self._parse_id_value(val, base=16) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except Exception: + pass + + if errors: + print(f"FAILED - {len(errors)} ID constraint violations:") + for e in errors: + print(e) + elif self.verbose: + print("PASSED - All paraId/durableId values within constraints") + return not errors + + def validate_comment_markers(self): + errors = [] + + document_xml = None + comments_xml = None + for xml_file in self.xml_files: + if xml_file.name == "document.xml" and "word" in str(xml_file): + document_xml = xml_file + elif xml_file.name == "comments.xml": + comments_xml = xml_file + + if not document_xml: + if self.verbose: + print("PASSED - No document.xml found (skipping comment validation)") + return True + + try: + doc_root = lxml.etree.parse(str(document_xml)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + range_starts = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeStart", namespaces=namespaces + ) + } + range_ends = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeEnd", namespaces=namespaces + ) + } + references = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentReference", namespaces=namespaces + ) + } + + orphaned_ends = range_ends - range_starts + for comment_id in sorted( + orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart' + ) + + orphaned_starts = range_starts - range_ends + for comment_id in sorted( + orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd' + ) + + comment_ids = set() + if comments_xml and comments_xml.exists(): + comments_root = lxml.etree.parse(str(comments_xml)).getroot() + comment_ids = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in comments_root.xpath( + ".//w:comment", namespaces=namespaces + ) + } + + marker_ids = range_starts | range_ends | references + invalid_refs = marker_ids - comment_ids + for comment_id in sorted( + invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + if comment_id: + errors.append( + f' document.xml: marker id="{comment_id}" references non-existent comment' + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append(f" Error parsing XML: {e}") + + if errors: + print(f"FAILED - {len(errors)} comment marker violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All comment markers properly paired") + return True + + def repair(self) -> int: + repairs = super().repair() + repairs += self.repair_durableId() + return repairs + + def repair_durableId(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if not elem.hasAttribute("w16cid:durableId"): + continue + + durable_id = elem.getAttribute("w16cid:durableId") + needs_repair = False + + if xml_file.name == "numbering.xml": + try: + needs_repair = ( + self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + else: + try: + needs_repair = ( + self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + + if needs_repair: + value = random.randint(1, 0x7FFFFFFE) + if xml_file.name == "numbering.xml": + new_id = str(value) + else: + new_id = f"{value:08X}" + + elem.setAttribute("w16cid:durableId", new_id) + print( + f" Repaired: {xml_file.name}: durableId {durable_id} → {new_id}" + ) + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/docx/scripts/office/validators/pptx.py b/.github/skills/docx/scripts/office/validators/pptx.py new file mode 100644 index 0000000..09842aa --- /dev/null +++ b/.github/skills/docx/scripts/office/validators/pptx.py @@ -0,0 +1,275 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_uuid_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_slide_layout_ids(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_notes_slide_references(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + import lxml.etree + + errors = [] + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(): + for attr, value in elem.attrib.items(): + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + if self._looks_like_uuid(value): + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + clean_value = value.strip("{}()").replace("-", "") + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + import lxml.etree + + errors = [] + + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + root = lxml.etree.parse(str(slide_master)).getroot() + + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + import lxml.etree + + errors = [] + notes_slide_references = {} + + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + normalized_target = target.replace("../", "") + + slide_name = rels_file.stem.replace( + ".xml", "" + ) + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/docx/scripts/office/validators/redlining.py b/.github/skills/docx/scripts/office/validators/redlining.py new file mode 100644 index 0000000..71c81b6 --- /dev/null +++ b/.github/skills/docx/scripts/office/validators/redlining.py @@ -0,0 +1,247 @@ +""" +Validator for tracked changes in Word documents. +""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + + def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.author = author + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def repair(self) -> int: + return 0 + + def validate(self): + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + author_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + author_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + + if not author_del_elements and not author_ins_elements: + if self.verbose: + print(f"PASSED - No tracked changes by {self.author} found.") + return True + + except Exception: + pass + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + self._remove_author_tracked_changes(original_root) + self._remove_author_tracked_changes(modified_root) + + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print(f"PASSED - All changes by {self.author} are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + error_parts = [ + f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + pass + + return None + + def _remove_author_tracked_changes(self, root): + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == self.author: + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == self.author: + to_process.append((child, list(parent).index(child))) + + for del_elem, del_index in reversed(to_process): + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/docx/scripts/templates/comments.xml b/.github/skills/docx/scripts/templates/comments.xml new file mode 100644 index 0000000..cd01a7d --- /dev/null +++ b/.github/skills/docx/scripts/templates/comments.xml @@ -0,0 +1,3 @@ + + + diff --git a/.github/skills/docx/scripts/templates/commentsExtended.xml b/.github/skills/docx/scripts/templates/commentsExtended.xml new file mode 100644 index 0000000..411003c --- /dev/null +++ b/.github/skills/docx/scripts/templates/commentsExtended.xml @@ -0,0 +1,3 @@ + + + diff --git a/.github/skills/docx/scripts/templates/commentsExtensible.xml b/.github/skills/docx/scripts/templates/commentsExtensible.xml new file mode 100644 index 0000000..f5572d7 --- /dev/null +++ b/.github/skills/docx/scripts/templates/commentsExtensible.xml @@ -0,0 +1,3 @@ + + + diff --git a/.github/skills/docx/scripts/templates/commentsIds.xml b/.github/skills/docx/scripts/templates/commentsIds.xml new file mode 100644 index 0000000..32f1629 --- /dev/null +++ b/.github/skills/docx/scripts/templates/commentsIds.xml @@ -0,0 +1,3 @@ + + + diff --git a/.github/skills/docx/scripts/templates/people.xml b/.github/skills/docx/scripts/templates/people.xml new file mode 100644 index 0000000..3803d2d --- /dev/null +++ b/.github/skills/docx/scripts/templates/people.xml @@ -0,0 +1,3 @@ + + + diff --git a/.github/skills/pdf/LICENSE.txt b/.github/skills/pdf/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/.github/skills/pdf/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/.github/skills/pdf/SKILL.md b/.github/skills/pdf/SKILL.md new file mode 100644 index 0000000..d3e046a --- /dev/null +++ b/.github/skills/pdf/SKILL.md @@ -0,0 +1,314 @@ +--- +name: pdf +description: Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill. +license: Proprietary. LICENSE.txt has complete terms +--- + +# PDF Processing Guide + +## Overview + +This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see REFERENCE.md. If you need to fill out a PDF form, read FORMS.md and follow its instructions. + +## Quick Start + +```python +from pypdf import PdfReader, PdfWriter + +# Read a PDF +reader = PdfReader("document.pdf") +print(f"Pages: {len(reader.pages)}") + +# Extract text +text = "" +for page in reader.pages: + text += page.extract_text() +``` + +## Python Libraries + +### pypdf - Basic Operations + +#### Merge PDFs +```python +from pypdf import PdfWriter, PdfReader + +writer = PdfWriter() +for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + +with open("merged.pdf", "wb") as output: + writer.write(output) +``` + +#### Split PDF +```python +reader = PdfReader("input.pdf") +for i, page in enumerate(reader.pages): + writer = PdfWriter() + writer.add_page(page) + with open(f"page_{i+1}.pdf", "wb") as output: + writer.write(output) +``` + +#### Extract Metadata +```python +reader = PdfReader("document.pdf") +meta = reader.metadata +print(f"Title: {meta.title}") +print(f"Author: {meta.author}") +print(f"Subject: {meta.subject}") +print(f"Creator: {meta.creator}") +``` + +#### Rotate Pages +```python +reader = PdfReader("input.pdf") +writer = PdfWriter() + +page = reader.pages[0] +page.rotate(90) # Rotate 90 degrees clockwise +writer.add_page(page) + +with open("rotated.pdf", "wb") as output: + writer.write(output) +``` + +### pdfplumber - Text and Table Extraction + +#### Extract Text with Layout +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + for page in pdf.pages: + text = page.extract_text() + print(text) +``` + +#### Extract Tables +```python +with pdfplumber.open("document.pdf") as pdf: + for i, page in enumerate(pdf.pages): + tables = page.extract_tables() + for j, table in enumerate(tables): + print(f"Table {j+1} on page {i+1}:") + for row in table: + print(row) +``` + +#### Advanced Table Extraction +```python +import pandas as pd + +with pdfplumber.open("document.pdf") as pdf: + all_tables = [] + for page in pdf.pages: + tables = page.extract_tables() + for table in tables: + if table: # Check if table is not empty + df = pd.DataFrame(table[1:], columns=table[0]) + all_tables.append(df) + +# Combine all tables +if all_tables: + combined_df = pd.concat(all_tables, ignore_index=True) + combined_df.to_excel("extracted_tables.xlsx", index=False) +``` + +### reportlab - Create PDFs + +#### Basic PDF Creation +```python +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas + +c = canvas.Canvas("hello.pdf", pagesize=letter) +width, height = letter + +# Add text +c.drawString(100, height - 100, "Hello World!") +c.drawString(100, height - 120, "This is a PDF created with reportlab") + +# Add a line +c.line(100, height - 140, 400, height - 140) + +# Save +c.save() +``` + +#### Create PDF with Multiple Pages +```python +from reportlab.lib.pagesizes import letter +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak +from reportlab.lib.styles import getSampleStyleSheet + +doc = SimpleDocTemplate("report.pdf", pagesize=letter) +styles = getSampleStyleSheet() +story = [] + +# Add content +title = Paragraph("Report Title", styles['Title']) +story.append(title) +story.append(Spacer(1, 12)) + +body = Paragraph("This is the body of the report. " * 20, styles['Normal']) +story.append(body) +story.append(PageBreak()) + +# Page 2 +story.append(Paragraph("Page 2", styles['Heading1'])) +story.append(Paragraph("Content for page 2", styles['Normal'])) + +# Build PDF +doc.build(story) +``` + +#### Subscripts and Superscripts + +**IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes. + +Instead, use ReportLab's XML markup tags in Paragraph objects: +```python +from reportlab.platypus import Paragraph +from reportlab.lib.styles import getSampleStyleSheet + +styles = getSampleStyleSheet() + +# Subscripts: use tag +chemical = Paragraph("H2O", styles['Normal']) + +# Superscripts: use tag +squared = Paragraph("x2 + y2", styles['Normal']) +``` + +For canvas-drawn text (not Paragraph objects), manually adjust font the size and position rather than using Unicode subscripts/superscripts. + +## Command-Line Tools + +### pdftotext (poppler-utils) +```bash +# Extract text +pdftotext input.pdf output.txt + +# Extract text preserving layout +pdftotext -layout input.pdf output.txt + +# Extract specific pages +pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5 +``` + +### qpdf +```bash +# Merge PDFs +qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf + +# Split pages +qpdf input.pdf --pages . 1-5 -- pages1-5.pdf +qpdf input.pdf --pages . 6-10 -- pages6-10.pdf + +# Rotate pages +qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees + +# Remove password +qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf +``` + +### pdftk (if available) +```bash +# Merge +pdftk file1.pdf file2.pdf cat output merged.pdf + +# Split +pdftk input.pdf burst + +# Rotate +pdftk input.pdf rotate 1east output rotated.pdf +``` + +## Common Tasks + +### Extract Text from Scanned PDFs +```python +# Requires: pip install pytesseract pdf2image +import pytesseract +from pdf2image import convert_from_path + +# Convert PDF to images +images = convert_from_path('scanned.pdf') + +# OCR each page +text = "" +for i, image in enumerate(images): + text += f"Page {i+1}:\n" + text += pytesseract.image_to_string(image) + text += "\n\n" + +print(text) +``` + +### Add Watermark +```python +from pypdf import PdfReader, PdfWriter + +# Create watermark (or load existing) +watermark = PdfReader("watermark.pdf").pages[0] + +# Apply to all pages +reader = PdfReader("document.pdf") +writer = PdfWriter() + +for page in reader.pages: + page.merge_page(watermark) + writer.add_page(page) + +with open("watermarked.pdf", "wb") as output: + writer.write(output) +``` + +### Extract Images +```bash +# Using pdfimages (poppler-utils) +pdfimages -j input.pdf output_prefix + +# This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc. +``` + +### Password Protection +```python +from pypdf import PdfReader, PdfWriter + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +for page in reader.pages: + writer.add_page(page) + +# Add password +writer.encrypt("userpassword", "ownerpassword") + +with open("encrypted.pdf", "wb") as output: + writer.write(output) +``` + +## Quick Reference + +| Task | Best Tool | Command/Code | +|------|-----------|--------------| +| Merge PDFs | pypdf | `writer.add_page(page)` | +| Split PDFs | pypdf | One page per file | +| Extract text | pdfplumber | `page.extract_text()` | +| Extract tables | pdfplumber | `page.extract_tables()` | +| Create PDFs | reportlab | Canvas or Platypus | +| Command line merge | qpdf | `qpdf --empty --pages ...` | +| OCR scanned PDFs | pytesseract | Convert to image first | +| Fill PDF forms | pdf-lib or pypdf (see FORMS.md) | See FORMS.md | + +## Next Steps + +- For advanced pypdfium2 usage, see REFERENCE.md +- For JavaScript libraries (pdf-lib), see REFERENCE.md +- If you need to fill out a PDF form, follow the instructions in FORMS.md +- For troubleshooting guides, see REFERENCE.md diff --git a/.github/skills/pdf/forms.md b/.github/skills/pdf/forms.md new file mode 100644 index 0000000..6e7e1e0 --- /dev/null +++ b/.github/skills/pdf/forms.md @@ -0,0 +1,294 @@ +**CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.** + +If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. Run this script from this file's directory: + `python scripts/check_fillable_fields `, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" and follow those instructions. + +# Fillable fields +If the PDF has fillable form fields: +- Run this script from this file's directory: `python scripts/extract_form_field_info.py `. It will create a JSON file with a list of fields in this format: +``` +[ + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "rect": ([left, bottom, right, top] bounding box in PDF coordinates, y=0 is the bottom of the page), + "type": ("text", "checkbox", "radio_group", or "choice"), + }, + // Checkboxes have "checked_value" and "unchecked_value" properties: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "checkbox", + "checked_value": (Set the field to this value to check the checkbox), + "unchecked_value": (Set the field to this value to uncheck the checkbox), + }, + // Radio groups have a "radio_options" list with the possible choices. + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "radio_group", + "radio_options": [ + { + "value": (set the field to this value to select this radio option), + "rect": (bounding box for the radio button for this option) + }, + // Other radio options + ] + }, + // Multiple choice fields have a "choice_options" list with the possible choices: + { + "field_id": (unique ID for the field), + "page": (page number, 1-based), + "type": "choice", + "choice_options": [ + { + "value": (set the field to this value to select this option), + "text": (display text of the option) + }, + // Other choice options + ], + } +] +``` +- Convert the PDF to PNGs (one image for each page) with this script (run from this file's directory): +`python scripts/convert_pdf_to_images.py ` +Then analyze the images to determine the purpose of each form field (make sure to convert the bounding box PDF coordinates to image coordinates). +- Create a `field_values.json` file in this format with the values to be entered for each field: +``` +[ + { + "field_id": "last_name", // Must match the field_id from `extract_form_field_info.py` + "description": "The user's last name", + "page": 1, // Must match the "page" value in field_info.json + "value": "Simpson" + }, + { + "field_id": "Checkbox12", + "description": "Checkbox to be checked if the user is 18 or over", + "page": 1, + "value": "/On" // If this is a checkbox, use its "checked_value" value to check it. If it's a radio button group, use one of the "value" values in "radio_options". + }, + // more fields +] +``` +- Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF: +`python scripts/fill_fillable_fields.py ` +This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again. + +# Non-fillable fields +If the PDF doesn't have fillable form fields, you'll add text annotations. First try to extract coordinates from the PDF structure (more accurate), then fall back to visual estimation if needed. + +## Step 1: Try Structure Extraction First + +Run this script to extract text labels, lines, and checkboxes with their exact PDF coordinates: +`python scripts/extract_form_structure.py form_structure.json` + +This creates a JSON file containing: +- **labels**: Every text element with exact coordinates (x0, top, x1, bottom in PDF points) +- **lines**: Horizontal lines that define row boundaries +- **checkboxes**: Small square rectangles that are checkboxes (with center coordinates) +- **row_boundaries**: Row top/bottom positions calculated from horizontal lines + +**Check the results**: If `form_structure.json` has meaningful labels (text elements that correspond to form fields), use **Approach A: Structure-Based Coordinates**. If the PDF is scanned/image-based and has few or no labels, use **Approach B: Visual Estimation**. + +--- + +## Approach A: Structure-Based Coordinates (Preferred) + +Use this when `extract_form_structure.py` found text labels in the PDF. + +### A.1: Analyze the Structure + +Read form_structure.json and identify: + +1. **Label groups**: Adjacent text elements that form a single label (e.g., "Last" + "Name") +2. **Row structure**: Labels with similar `top` values are in the same row +3. **Field columns**: Entry areas start after label ends (x0 = label.x1 + gap) +4. **Checkboxes**: Use the checkbox coordinates directly from the structure + +**Coordinate system**: PDF coordinates where y=0 is at TOP of page, y increases downward. + +### A.2: Check for Missing Elements + +The structure extraction may not detect all form elements. Common cases: +- **Circular checkboxes**: Only square rectangles are detected as checkboxes +- **Complex graphics**: Decorative elements or non-standard form controls +- **Faded or light-colored elements**: May not be extracted + +If you see form fields in the PDF images that aren't in form_structure.json, you'll need to use **visual analysis** for those specific fields (see "Hybrid Approach" below). + +### A.3: Create fields.json with PDF Coordinates + +For each field, calculate entry coordinates from the extracted structure: + +**Text fields:** +- entry x0 = label x1 + 5 (small gap after label) +- entry x1 = next label's x0, or row boundary +- entry top = same as label top +- entry bottom = row boundary line below, or label bottom + row_height + +**Checkboxes:** +- Use the checkbox rectangle coordinates directly from form_structure.json +- entry_bounding_box = [checkbox.x0, checkbox.top, checkbox.x1, checkbox.bottom] + +Create fields.json using `pdf_width` and `pdf_height` (signals PDF coordinates): +```json +{ + "pages": [ + {"page_number": 1, "pdf_width": 612, "pdf_height": 792} + ], + "form_fields": [ + { + "page_number": 1, + "description": "Last name entry field", + "field_label": "Last Name", + "label_bounding_box": [43, 63, 87, 73], + "entry_bounding_box": [92, 63, 260, 79], + "entry_text": {"text": "Smith", "font_size": 10} + }, + { + "page_number": 1, + "description": "US Citizen Yes checkbox", + "field_label": "Yes", + "label_bounding_box": [260, 200, 280, 210], + "entry_bounding_box": [285, 197, 292, 205], + "entry_text": {"text": "X"} + } + ] +} +``` + +**Important**: Use `pdf_width`/`pdf_height` and coordinates directly from form_structure.json. + +### A.4: Validate Bounding Boxes + +Before filling, check your bounding boxes for errors: +`python scripts/check_bounding_boxes.py fields.json` + +This checks for intersecting bounding boxes and entry boxes that are too small for the font size. Fix any reported errors before filling. + +--- + +## Approach B: Visual Estimation (Fallback) + +Use this when the PDF is scanned/image-based and structure extraction found no usable text labels (e.g., all text shows as "(cid:X)" patterns). + +### B.1: Convert PDF to Images + +`python scripts/convert_pdf_to_images.py ` + +### B.2: Initial Field Identification + +Examine each page image to identify form sections and get **rough estimates** of field locations: +- Form field labels and their approximate positions +- Entry areas (lines, boxes, or blank spaces for text input) +- Checkboxes and their approximate locations + +For each field, note approximate pixel coordinates (they don't need to be precise yet). + +### B.3: Zoom Refinement (CRITICAL for accuracy) + +For each field, crop a region around the estimated position to refine coordinates precisely. + +**Create a zoomed crop using ImageMagick:** +```bash +magick -crop x++ +repage +``` + +Where: +- `, ` = top-left corner of crop region (use your rough estimate minus padding) +- `, ` = size of crop region (field area plus ~50px padding on each side) + +**Example:** To refine a "Name" field estimated around (100, 150): +```bash +magick images_dir/page_1.png -crop 300x80+50+120 +repage crops/name_field.png +``` + +(Note: if the `magick` command isn't available, try `convert` with the same arguments). + +**Examine the cropped image** to determine precise coordinates: +1. Identify the exact pixel where the entry area begins (after the label) +2. Identify where the entry area ends (before next field or edge) +3. Identify the top and bottom of the entry line/box + +**Convert crop coordinates back to full image coordinates:** +- full_x = crop_x + crop_offset_x +- full_y = crop_y + crop_offset_y + +Example: If the crop started at (50, 120) and the entry box starts at (52, 18) within the crop: +- entry_x0 = 52 + 50 = 102 +- entry_top = 18 + 120 = 138 + +**Repeat for each field**, grouping nearby fields into single crops when possible. + +### B.4: Create fields.json with Refined Coordinates + +Create fields.json using `image_width` and `image_height` (signals image coordinates): +```json +{ + "pages": [ + {"page_number": 1, "image_width": 1700, "image_height": 2200} + ], + "form_fields": [ + { + "page_number": 1, + "description": "Last name entry field", + "field_label": "Last Name", + "label_bounding_box": [120, 175, 242, 198], + "entry_bounding_box": [255, 175, 720, 218], + "entry_text": {"text": "Smith", "font_size": 10} + } + ] +} +``` + +**Important**: Use `image_width`/`image_height` and the refined pixel coordinates from the zoom analysis. + +### B.5: Validate Bounding Boxes + +Before filling, check your bounding boxes for errors: +`python scripts/check_bounding_boxes.py fields.json` + +This checks for intersecting bounding boxes and entry boxes that are too small for the font size. Fix any reported errors before filling. + +--- + +## Hybrid Approach: Structure + Visual + +Use this when structure extraction works for most fields but misses some elements (e.g., circular checkboxes, unusual form controls). + +1. **Use Approach A** for fields that were detected in form_structure.json +2. **Convert PDF to images** for visual analysis of missing fields +3. **Use zoom refinement** (from Approach B) for the missing fields +4. **Combine coordinates**: For fields from structure extraction, use `pdf_width`/`pdf_height`. For visually-estimated fields, you must convert image coordinates to PDF coordinates: + - pdf_x = image_x * (pdf_width / image_width) + - pdf_y = image_y * (pdf_height / image_height) +5. **Use a single coordinate system** in fields.json - convert all to PDF coordinates with `pdf_width`/`pdf_height` + +--- + +## Step 2: Validate Before Filling + +**Always validate bounding boxes before filling:** +`python scripts/check_bounding_boxes.py fields.json` + +This checks for: +- Intersecting bounding boxes (which would cause overlapping text) +- Entry boxes that are too small for the specified font size + +Fix any reported errors in fields.json before proceeding. + +## Step 3: Fill the Form + +The fill script auto-detects the coordinate system and handles conversion: +`python scripts/fill_pdf_form_with_annotations.py fields.json ` + +## Step 4: Verify Output + +Convert the filled PDF to images and verify text placement: +`python scripts/convert_pdf_to_images.py ` + +If text is mispositioned: +- **Approach A**: Check that you're using PDF coordinates from form_structure.json with `pdf_width`/`pdf_height` +- **Approach B**: Check that image dimensions match and coordinates are accurate pixels +- **Hybrid**: Ensure coordinate conversions are correct for visually-estimated fields diff --git a/.github/skills/pdf/reference.md b/.github/skills/pdf/reference.md new file mode 100644 index 0000000..41400bf --- /dev/null +++ b/.github/skills/pdf/reference.md @@ -0,0 +1,612 @@ +# PDF Processing Advanced Reference + +This document contains advanced PDF processing features, detailed examples, and additional libraries not covered in the main skill instructions. + +## pypdfium2 Library (Apache/BSD License) + +### Overview +pypdfium2 is a Python binding for PDFium (Chromium's PDF library). It's excellent for fast PDF rendering, image generation, and serves as a PyMuPDF replacement. + +### Render PDF to Images +```python +import pypdfium2 as pdfium +from PIL import Image + +# Load PDF +pdf = pdfium.PdfDocument("document.pdf") + +# Render page to image +page = pdf[0] # First page +bitmap = page.render( + scale=2.0, # Higher resolution + rotation=0 # No rotation +) + +# Convert to PIL Image +img = bitmap.to_pil() +img.save("page_1.png", "PNG") + +# Process multiple pages +for i, page in enumerate(pdf): + bitmap = page.render(scale=1.5) + img = bitmap.to_pil() + img.save(f"page_{i+1}.jpg", "JPEG", quality=90) +``` + +### Extract Text with pypdfium2 +```python +import pypdfium2 as pdfium + +pdf = pdfium.PdfDocument("document.pdf") +for i, page in enumerate(pdf): + text = page.get_text() + print(f"Page {i+1} text length: {len(text)} chars") +``` + +## JavaScript Libraries + +### pdf-lib (MIT License) + +pdf-lib is a powerful JavaScript library for creating and modifying PDF documents in any JavaScript environment. + +#### Load and Manipulate Existing PDF +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function manipulatePDF() { + // Load existing PDF + const existingPdfBytes = fs.readFileSync('input.pdf'); + const pdfDoc = await PDFDocument.load(existingPdfBytes); + + // Get page count + const pageCount = pdfDoc.getPageCount(); + console.log(`Document has ${pageCount} pages`); + + // Add new page + const newPage = pdfDoc.addPage([600, 400]); + newPage.drawText('Added by pdf-lib', { + x: 100, + y: 300, + size: 16 + }); + + // Save modified PDF + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('modified.pdf', pdfBytes); +} +``` + +#### Create Complex PDFs from Scratch +```javascript +import { PDFDocument, rgb, StandardFonts } from 'pdf-lib'; +import fs from 'fs'; + +async function createPDF() { + const pdfDoc = await PDFDocument.create(); + + // Add fonts + const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica); + const helveticaBold = await pdfDoc.embedFont(StandardFonts.HelveticaBold); + + // Add page + const page = pdfDoc.addPage([595, 842]); // A4 size + const { width, height } = page.getSize(); + + // Add text with styling + page.drawText('Invoice #12345', { + x: 50, + y: height - 50, + size: 18, + font: helveticaBold, + color: rgb(0.2, 0.2, 0.8) + }); + + // Add rectangle (header background) + page.drawRectangle({ + x: 40, + y: height - 100, + width: width - 80, + height: 30, + color: rgb(0.9, 0.9, 0.9) + }); + + // Add table-like content + const items = [ + ['Item', 'Qty', 'Price', 'Total'], + ['Widget', '2', '$50', '$100'], + ['Gadget', '1', '$75', '$75'] + ]; + + let yPos = height - 150; + items.forEach(row => { + let xPos = 50; + row.forEach(cell => { + page.drawText(cell, { + x: xPos, + y: yPos, + size: 12, + font: helveticaFont + }); + xPos += 120; + }); + yPos -= 25; + }); + + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('created.pdf', pdfBytes); +} +``` + +#### Advanced Merge and Split Operations +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function mergePDFs() { + // Create new document + const mergedPdf = await PDFDocument.create(); + + // Load source PDFs + const pdf1Bytes = fs.readFileSync('doc1.pdf'); + const pdf2Bytes = fs.readFileSync('doc2.pdf'); + + const pdf1 = await PDFDocument.load(pdf1Bytes); + const pdf2 = await PDFDocument.load(pdf2Bytes); + + // Copy pages from first PDF + const pdf1Pages = await mergedPdf.copyPages(pdf1, pdf1.getPageIndices()); + pdf1Pages.forEach(page => mergedPdf.addPage(page)); + + // Copy specific pages from second PDF (pages 0, 2, 4) + const pdf2Pages = await mergedPdf.copyPages(pdf2, [0, 2, 4]); + pdf2Pages.forEach(page => mergedPdf.addPage(page)); + + const mergedPdfBytes = await mergedPdf.save(); + fs.writeFileSync('merged.pdf', mergedPdfBytes); +} +``` + +### pdfjs-dist (Apache License) + +PDF.js is Mozilla's JavaScript library for rendering PDFs in the browser. + +#### Basic PDF Loading and Rendering +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +// Configure worker (important for performance) +pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js'; + +async function renderPDF() { + // Load PDF + const loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + console.log(`Loaded PDF with ${pdf.numPages} pages`); + + // Get first page + const page = await pdf.getPage(1); + const viewport = page.getViewport({ scale: 1.5 }); + + // Render to canvas + const canvas = document.createElement('canvas'); + const context = canvas.getContext('2d'); + canvas.height = viewport.height; + canvas.width = viewport.width; + + const renderContext = { + canvasContext: context, + viewport: viewport + }; + + await page.render(renderContext).promise; + document.body.appendChild(canvas); +} +``` + +#### Extract Text with Coordinates +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractText() { + const loadingTask = pdfjsLib.getDocument('document.pdf'); + const pdf = await loadingTask.promise; + + let fullText = ''; + + // Extract text from all pages + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + + const pageText = textContent.items + .map(item => item.str) + .join(' '); + + fullText += `\n--- Page ${i} ---\n${pageText}`; + + // Get text with coordinates for advanced processing + const textWithCoords = textContent.items.map(item => ({ + text: item.str, + x: item.transform[4], + y: item.transform[5], + width: item.width, + height: item.height + })); + } + + console.log(fullText); + return fullText; +} +``` + +#### Extract Annotations and Forms +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractAnnotations() { + const loadingTask = pdfjsLib.getDocument('annotated.pdf'); + const pdf = await loadingTask.promise; + + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const annotations = await page.getAnnotations(); + + annotations.forEach(annotation => { + console.log(`Annotation type: ${annotation.subtype}`); + console.log(`Content: ${annotation.contents}`); + console.log(`Coordinates: ${JSON.stringify(annotation.rect)}`); + }); + } +} +``` + +## Advanced Command-Line Operations + +### poppler-utils Advanced Features + +#### Extract Text with Bounding Box Coordinates +```bash +# Extract text with bounding box coordinates (essential for structured data) +pdftotext -bbox-layout document.pdf output.xml + +# The XML output contains precise coordinates for each text element +``` + +#### Advanced Image Conversion +```bash +# Convert to PNG images with specific resolution +pdftoppm -png -r 300 document.pdf output_prefix + +# Convert specific page range with high resolution +pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages + +# Convert to JPEG with quality setting +pdftoppm -jpeg -jpegopt quality=85 -r 200 document.pdf jpeg_output +``` + +#### Extract Embedded Images +```bash +# Extract all embedded images with metadata +pdfimages -j -p document.pdf page_images + +# List image info without extracting +pdfimages -list document.pdf + +# Extract images in their original format +pdfimages -all document.pdf images/img +``` + +### qpdf Advanced Features + +#### Complex Page Manipulation +```bash +# Split PDF into groups of pages +qpdf --split-pages=3 input.pdf output_group_%02d.pdf + +# Extract specific pages with complex ranges +qpdf input.pdf --pages input.pdf 1,3-5,8,10-end -- extracted.pdf + +# Merge specific pages from multiple PDFs +qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf +``` + +#### PDF Optimization and Repair +```bash +# Optimize PDF for web (linearize for streaming) +qpdf --linearize input.pdf optimized.pdf + +# Remove unused objects and compress +qpdf --optimize-level=all input.pdf compressed.pdf + +# Attempt to repair corrupted PDF structure +qpdf --check input.pdf +qpdf --fix-qdf damaged.pdf repaired.pdf + +# Show detailed PDF structure for debugging +qpdf --show-all-pages input.pdf > structure.txt +``` + +#### Advanced Encryption +```bash +# Add password protection with specific permissions +qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf + +# Check encryption status +qpdf --show-encryption encrypted.pdf + +# Remove password protection (requires password) +qpdf --password=secret123 --decrypt encrypted.pdf decrypted.pdf +``` + +## Advanced Python Techniques + +### pdfplumber Advanced Features + +#### Extract Text with Precise Coordinates +```python +import pdfplumber + +with pdfplumber.open("document.pdf") as pdf: + page = pdf.pages[0] + + # Extract all text with coordinates + chars = page.chars + for char in chars[:10]: # First 10 characters + print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}") + + # Extract text by bounding box (left, top, right, bottom) + bbox_text = page.within_bbox((100, 100, 400, 200)).extract_text() +``` + +#### Advanced Table Extraction with Custom Settings +```python +import pdfplumber +import pandas as pd + +with pdfplumber.open("complex_table.pdf") as pdf: + page = pdf.pages[0] + + # Extract tables with custom settings for complex layouts + table_settings = { + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "snap_tolerance": 3, + "intersection_tolerance": 15 + } + tables = page.extract_tables(table_settings) + + # Visual debugging for table extraction + img = page.to_image(resolution=150) + img.save("debug_layout.png") +``` + +### reportlab Advanced Features + +#### Create Professional Reports with Tables +```python +from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib import colors + +# Sample data +data = [ + ['Product', 'Q1', 'Q2', 'Q3', 'Q4'], + ['Widgets', '120', '135', '142', '158'], + ['Gadgets', '85', '92', '98', '105'] +] + +# Create PDF with table +doc = SimpleDocTemplate("report.pdf") +elements = [] + +# Add title +styles = getSampleStyleSheet() +title = Paragraph("Quarterly Sales Report", styles['Title']) +elements.append(title) + +# Add table with advanced styling +table = Table(data) +table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) +])) +elements.append(table) + +doc.build(elements) +``` + +## Complex Workflows + +### Extract Figures/Images from PDF + +#### Method 1: Using pdfimages (fastest) +```bash +# Extract all images with original quality +pdfimages -all document.pdf images/img +``` + +#### Method 2: Using pypdfium2 + Image Processing +```python +import pypdfium2 as pdfium +from PIL import Image +import numpy as np + +def extract_figures(pdf_path, output_dir): + pdf = pdfium.PdfDocument(pdf_path) + + for page_num, page in enumerate(pdf): + # Render high-resolution page + bitmap = page.render(scale=3.0) + img = bitmap.to_pil() + + # Convert to numpy for processing + img_array = np.array(img) + + # Simple figure detection (non-white regions) + mask = np.any(img_array != [255, 255, 255], axis=2) + + # Find contours and extract bounding boxes + # (This is simplified - real implementation would need more sophisticated detection) + + # Save detected figures + # ... implementation depends on specific needs +``` + +### Batch PDF Processing with Error Handling +```python +import os +import glob +from pypdf import PdfReader, PdfWriter +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def batch_process_pdfs(input_dir, operation='merge'): + pdf_files = glob.glob(os.path.join(input_dir, "*.pdf")) + + if operation == 'merge': + writer = PdfWriter() + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + logger.info(f"Processed: {pdf_file}") + except Exception as e: + logger.error(f"Failed to process {pdf_file}: {e}") + continue + + with open("batch_merged.pdf", "wb") as output: + writer.write(output) + + elif operation == 'extract_text': + for pdf_file in pdf_files: + try: + reader = PdfReader(pdf_file) + text = "" + for page in reader.pages: + text += page.extract_text() + + output_file = pdf_file.replace('.pdf', '.txt') + with open(output_file, 'w', encoding='utf-8') as f: + f.write(text) + logger.info(f"Extracted text from: {pdf_file}") + + except Exception as e: + logger.error(f"Failed to extract text from {pdf_file}: {e}") + continue +``` + +### Advanced PDF Cropping +```python +from pypdf import PdfWriter, PdfReader + +reader = PdfReader("input.pdf") +writer = PdfWriter() + +# Crop page (left, bottom, right, top in points) +page = reader.pages[0] +page.mediabox.left = 50 +page.mediabox.bottom = 50 +page.mediabox.right = 550 +page.mediabox.top = 750 + +writer.add_page(page) +with open("cropped.pdf", "wb") as output: + writer.write(output) +``` + +## Performance Optimization Tips + +### 1. For Large PDFs +- Use streaming approaches instead of loading entire PDF in memory +- Use `qpdf --split-pages` for splitting large files +- Process pages individually with pypdfium2 + +### 2. For Text Extraction +- `pdftotext -bbox-layout` is fastest for plain text extraction +- Use pdfplumber for structured data and tables +- Avoid `pypdf.extract_text()` for very large documents + +### 3. For Image Extraction +- `pdfimages` is much faster than rendering pages +- Use low resolution for previews, high resolution for final output + +### 4. For Form Filling +- pdf-lib maintains form structure better than most alternatives +- Pre-validate form fields before processing + +### 5. Memory Management +```python +# Process PDFs in chunks +def process_large_pdf(pdf_path, chunk_size=10): + reader = PdfReader(pdf_path) + total_pages = len(reader.pages) + + for start_idx in range(0, total_pages, chunk_size): + end_idx = min(start_idx + chunk_size, total_pages) + writer = PdfWriter() + + for i in range(start_idx, end_idx): + writer.add_page(reader.pages[i]) + + # Process chunk + with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as output: + writer.write(output) +``` + +## Troubleshooting Common Issues + +### Encrypted PDFs +```python +# Handle password-protected PDFs +from pypdf import PdfReader + +try: + reader = PdfReader("encrypted.pdf") + if reader.is_encrypted: + reader.decrypt("password") +except Exception as e: + print(f"Failed to decrypt: {e}") +``` + +### Corrupted PDFs +```bash +# Use qpdf to repair +qpdf --check corrupted.pdf +qpdf --replace-input corrupted.pdf +``` + +### Text Extraction Issues +```python +# Fallback to OCR for scanned PDFs +import pytesseract +from pdf2image import convert_from_path + +def extract_text_with_ocr(pdf_path): + images = convert_from_path(pdf_path) + text = "" + for i, image in enumerate(images): + text += pytesseract.image_to_string(image) + return text +``` + +## License Information + +- **pypdf**: BSD License +- **pdfplumber**: MIT License +- **pypdfium2**: Apache/BSD License +- **reportlab**: BSD License +- **poppler-utils**: GPL-2 License +- **qpdf**: Apache License +- **pdf-lib**: MIT License +- **pdfjs-dist**: Apache License \ No newline at end of file diff --git a/.github/skills/pdf/scripts/check_bounding_boxes.py b/.github/skills/pdf/scripts/check_bounding_boxes.py new file mode 100644 index 0000000..2cc5e34 --- /dev/null +++ b/.github/skills/pdf/scripts/check_bounding_boxes.py @@ -0,0 +1,65 @@ +from dataclasses import dataclass +import json +import sys + + + + +@dataclass +class RectAndField: + rect: list[float] + rect_type: str + field: dict + + +def get_bounding_box_messages(fields_json_stream) -> list[str]: + messages = [] + fields = json.load(fields_json_stream) + messages.append(f"Read {len(fields['form_fields'])} fields") + + def rects_intersect(r1, r2): + disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0] + disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1] + return not (disjoint_horizontal or disjoint_vertical) + + rects_and_fields = [] + for f in fields["form_fields"]: + rects_and_fields.append(RectAndField(f["label_bounding_box"], "label", f)) + rects_and_fields.append(RectAndField(f["entry_bounding_box"], "entry", f)) + + has_error = False + for i, ri in enumerate(rects_and_fields): + for j in range(i + 1, len(rects_and_fields)): + rj = rects_and_fields[j] + if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect): + has_error = True + if ri.field is rj.field: + messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['description']}` ({ri.rect}, {rj.rect})") + else: + messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['description']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['description']}` ({rj.rect})") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + if ri.rect_type == "entry": + if "entry_text" in ri.field: + font_size = ri.field["entry_text"].get("font_size", 14) + entry_height = ri.rect[3] - ri.rect[1] + if entry_height < font_size: + has_error = True + messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.") + if len(messages) >= 20: + messages.append("Aborting further checks; fix bounding boxes and try again") + return messages + + if not has_error: + messages.append("SUCCESS: All bounding boxes are valid") + return messages + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: check_bounding_boxes.py [fields.json]") + sys.exit(1) + with open(sys.argv[1]) as f: + messages = get_bounding_box_messages(f) + for msg in messages: + print(msg) diff --git a/.github/skills/pdf/scripts/check_fillable_fields.py b/.github/skills/pdf/scripts/check_fillable_fields.py new file mode 100644 index 0000000..36dfb95 --- /dev/null +++ b/.github/skills/pdf/scripts/check_fillable_fields.py @@ -0,0 +1,11 @@ +import sys +from pypdf import PdfReader + + + + +reader = PdfReader(sys.argv[1]) +if (reader.get_fields()): + print("This PDF has fillable form fields") +else: + print("This PDF does not have fillable form fields; you will need to visually determine where to enter data") diff --git a/.github/skills/pdf/scripts/convert_pdf_to_images.py b/.github/skills/pdf/scripts/convert_pdf_to_images.py new file mode 100644 index 0000000..7939cef --- /dev/null +++ b/.github/skills/pdf/scripts/convert_pdf_to_images.py @@ -0,0 +1,33 @@ +import os +import sys + +from pdf2image import convert_from_path + + + + +def convert(pdf_path, output_dir, max_dim=1000): + images = convert_from_path(pdf_path, dpi=200) + + for i, image in enumerate(images): + width, height = image.size + if width > max_dim or height > max_dim: + scale_factor = min(max_dim / width, max_dim / height) + new_width = int(width * scale_factor) + new_height = int(height * scale_factor) + image = image.resize((new_width, new_height)) + + image_path = os.path.join(output_dir, f"page_{i+1}.png") + image.save(image_path) + print(f"Saved page {i+1} as {image_path} (size: {image.size})") + + print(f"Converted {len(images)} pages to PNG images") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: convert_pdf_to_images.py [input pdf] [output directory]") + sys.exit(1) + pdf_path = sys.argv[1] + output_directory = sys.argv[2] + convert(pdf_path, output_directory) diff --git a/.github/skills/pdf/scripts/create_validation_image.py b/.github/skills/pdf/scripts/create_validation_image.py new file mode 100644 index 0000000..10eadd8 --- /dev/null +++ b/.github/skills/pdf/scripts/create_validation_image.py @@ -0,0 +1,37 @@ +import json +import sys + +from PIL import Image, ImageDraw + + + + +def create_validation_image(page_number, fields_json_path, input_path, output_path): + with open(fields_json_path, 'r') as f: + data = json.load(f) + + img = Image.open(input_path) + draw = ImageDraw.Draw(img) + num_boxes = 0 + + for field in data["form_fields"]: + if field["page_number"] == page_number: + entry_box = field['entry_bounding_box'] + label_box = field['label_bounding_box'] + draw.rectangle(entry_box, outline='red', width=2) + draw.rectangle(label_box, outline='blue', width=2) + num_boxes += 2 + + img.save(output_path) + print(f"Created validation image at {output_path} with {num_boxes} bounding boxes") + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("Usage: create_validation_image.py [page number] [fields.json file] [input image path] [output image path]") + sys.exit(1) + page_number = int(sys.argv[1]) + fields_json_path = sys.argv[2] + input_image_path = sys.argv[3] + output_image_path = sys.argv[4] + create_validation_image(page_number, fields_json_path, input_image_path, output_image_path) diff --git a/.github/skills/pdf/scripts/extract_form_field_info.py b/.github/skills/pdf/scripts/extract_form_field_info.py new file mode 100644 index 0000000..64cd470 --- /dev/null +++ b/.github/skills/pdf/scripts/extract_form_field_info.py @@ -0,0 +1,122 @@ +import json +import sys + +from pypdf import PdfReader + + + + +def get_full_annotation_field_id(annotation): + components = [] + while annotation: + field_name = annotation.get('/T') + if field_name: + components.append(field_name) + annotation = annotation.get('/Parent') + return ".".join(reversed(components)) if components else None + + +def make_field_dict(field, field_id): + field_dict = {"field_id": field_id} + ft = field.get('/FT') + if ft == "/Tx": + field_dict["type"] = "text" + elif ft == "/Btn": + field_dict["type"] = "checkbox" + states = field.get("/_States_", []) + if len(states) == 2: + if "/Off" in states: + field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1] + field_dict["unchecked_value"] = "/Off" + else: + print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.") + field_dict["checked_value"] = states[0] + field_dict["unchecked_value"] = states[1] + elif ft == "/Ch": + field_dict["type"] = "choice" + states = field.get("/_States_", []) + field_dict["choice_options"] = [{ + "value": state[0], + "text": state[1], + } for state in states] + else: + field_dict["type"] = f"unknown ({ft})" + return field_dict + + +def get_field_info(reader: PdfReader): + fields = reader.get_fields() + + field_info_by_id = {} + possible_radio_names = set() + + for field_id, field in fields.items(): + if field.get("/Kids"): + if field.get("/FT") == "/Btn": + possible_radio_names.add(field_id) + continue + field_info_by_id[field_id] = make_field_dict(field, field_id) + + + radio_fields_by_id = {} + + for page_index, page in enumerate(reader.pages): + annotations = page.get('/Annots', []) + for ann in annotations: + field_id = get_full_annotation_field_id(ann) + if field_id in field_info_by_id: + field_info_by_id[field_id]["page"] = page_index + 1 + field_info_by_id[field_id]["rect"] = ann.get('/Rect') + elif field_id in possible_radio_names: + try: + on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"] + except KeyError: + continue + if len(on_values) == 1: + rect = ann.get("/Rect") + if field_id not in radio_fields_by_id: + radio_fields_by_id[field_id] = { + "field_id": field_id, + "type": "radio_group", + "page": page_index + 1, + "radio_options": [], + } + radio_fields_by_id[field_id]["radio_options"].append({ + "value": on_values[0], + "rect": rect, + }) + + fields_with_location = [] + for field_info in field_info_by_id.values(): + if "page" in field_info: + fields_with_location.append(field_info) + else: + print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring") + + def sort_key(f): + if "radio_options" in f: + rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0] + else: + rect = f.get("rect") or [0, 0, 0, 0] + adjusted_position = [-rect[1], rect[0]] + return [f.get("page"), adjusted_position] + + sorted_fields = fields_with_location + list(radio_fields_by_id.values()) + sorted_fields.sort(key=sort_key) + + return sorted_fields + + +def write_field_info(pdf_path: str, json_output_path: str): + reader = PdfReader(pdf_path) + field_info = get_field_info(reader) + with open(json_output_path, "w") as f: + json.dump(field_info, f, indent=2) + print(f"Wrote {len(field_info)} fields to {json_output_path}") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: extract_form_field_info.py [input pdf] [output json]") + sys.exit(1) + write_field_info(sys.argv[1], sys.argv[2]) diff --git a/.github/skills/pdf/scripts/extract_form_structure.py b/.github/skills/pdf/scripts/extract_form_structure.py new file mode 100644 index 0000000..f219e7d --- /dev/null +++ b/.github/skills/pdf/scripts/extract_form_structure.py @@ -0,0 +1,115 @@ +""" +Extract form structure from a non-fillable PDF. + +This script analyzes the PDF to find: +- Text labels with their exact coordinates +- Horizontal lines (row boundaries) +- Checkboxes (small rectangles) + +Output: A JSON file with the form structure that can be used to generate +accurate field coordinates for filling. + +Usage: python extract_form_structure.py +""" + +import json +import sys +import pdfplumber + + +def extract_form_structure(pdf_path): + structure = { + "pages": [], + "labels": [], + "lines": [], + "checkboxes": [], + "row_boundaries": [] + } + + with pdfplumber.open(pdf_path) as pdf: + for page_num, page in enumerate(pdf.pages, 1): + structure["pages"].append({ + "page_number": page_num, + "width": float(page.width), + "height": float(page.height) + }) + + words = page.extract_words() + for word in words: + structure["labels"].append({ + "page": page_num, + "text": word["text"], + "x0": round(float(word["x0"]), 1), + "top": round(float(word["top"]), 1), + "x1": round(float(word["x1"]), 1), + "bottom": round(float(word["bottom"]), 1) + }) + + for line in page.lines: + if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5: + structure["lines"].append({ + "page": page_num, + "y": round(float(line["top"]), 1), + "x0": round(float(line["x0"]), 1), + "x1": round(float(line["x1"]), 1) + }) + + for rect in page.rects: + width = float(rect["x1"]) - float(rect["x0"]) + height = float(rect["bottom"]) - float(rect["top"]) + if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2: + structure["checkboxes"].append({ + "page": page_num, + "x0": round(float(rect["x0"]), 1), + "top": round(float(rect["top"]), 1), + "x1": round(float(rect["x1"]), 1), + "bottom": round(float(rect["bottom"]), 1), + "center_x": round((float(rect["x0"]) + float(rect["x1"])) / 2, 1), + "center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1) + }) + + lines_by_page = {} + for line in structure["lines"]: + page = line["page"] + if page not in lines_by_page: + lines_by_page[page] = [] + lines_by_page[page].append(line["y"]) + + for page, y_coords in lines_by_page.items(): + y_coords = sorted(set(y_coords)) + for i in range(len(y_coords) - 1): + structure["row_boundaries"].append({ + "page": page, + "row_top": y_coords[i], + "row_bottom": y_coords[i + 1], + "row_height": round(y_coords[i + 1] - y_coords[i], 1) + }) + + return structure + + +def main(): + if len(sys.argv) != 3: + print("Usage: extract_form_structure.py ") + sys.exit(1) + + pdf_path = sys.argv[1] + output_path = sys.argv[2] + + print(f"Extracting structure from {pdf_path}...") + structure = extract_form_structure(pdf_path) + + with open(output_path, "w") as f: + json.dump(structure, f, indent=2) + + print(f"Found:") + print(f" - {len(structure['pages'])} pages") + print(f" - {len(structure['labels'])} text labels") + print(f" - {len(structure['lines'])} horizontal lines") + print(f" - {len(structure['checkboxes'])} checkboxes") + print(f" - {len(structure['row_boundaries'])} row boundaries") + print(f"Saved to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/.github/skills/pdf/scripts/fill_fillable_fields.py b/.github/skills/pdf/scripts/fill_fillable_fields.py new file mode 100644 index 0000000..51c2600 --- /dev/null +++ b/.github/skills/pdf/scripts/fill_fillable_fields.py @@ -0,0 +1,98 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter + +from extract_form_field_info import get_field_info + + + + +def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str): + with open(fields_json_path) as f: + fields = json.load(f) + fields_by_page = {} + for field in fields: + if "value" in field: + field_id = field["field_id"] + page = field["page"] + if page not in fields_by_page: + fields_by_page[page] = {} + fields_by_page[page][field_id] = field["value"] + + reader = PdfReader(input_pdf_path) + + has_error = False + field_info = get_field_info(reader) + fields_by_ids = {f["field_id"]: f for f in field_info} + for field in fields: + existing_field = fields_by_ids.get(field["field_id"]) + if not existing_field: + has_error = True + print(f"ERROR: `{field['field_id']}` is not a valid field ID") + elif field["page"] != existing_field["page"]: + has_error = True + print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})") + else: + if "value" in field: + err = validation_error_for_field_value(existing_field, field["value"]) + if err: + print(err) + has_error = True + if has_error: + sys.exit(1) + + writer = PdfWriter(clone_from=reader) + for page, field_values in fields_by_page.items(): + writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False) + + writer.set_need_appearances_writer(True) + + with open(output_pdf_path, "wb") as f: + writer.write(f) + + +def validation_error_for_field_value(field_info, field_value): + field_type = field_info["type"] + field_id = field_info["field_id"] + if field_type == "checkbox": + checked_val = field_info["checked_value"] + unchecked_val = field_info["unchecked_value"] + if field_value != checked_val and field_value != unchecked_val: + return f'ERROR: Invalid value "{field_value}" for checkbox field "{field_id}". The checked value is "{checked_val}" and the unchecked value is "{unchecked_val}"' + elif field_type == "radio_group": + option_values = [opt["value"] for opt in field_info["radio_options"]] + if field_value not in option_values: + return f'ERROR: Invalid value "{field_value}" for radio group field "{field_id}". Valid values are: {option_values}' + elif field_type == "choice": + choice_values = [opt["value"] for opt in field_info["choice_options"]] + if field_value not in choice_values: + return f'ERROR: Invalid value "{field_value}" for choice field "{field_id}". Valid values are: {choice_values}' + return None + + +def monkeypatch_pydpf_method(): + from pypdf.generic import DictionaryObject + from pypdf.constants import FieldDictionaryAttributes + + original_get_inherited = DictionaryObject.get_inherited + + def patched_get_inherited(self, key: str, default = None): + result = original_get_inherited(self, key, default) + if key == FieldDictionaryAttributes.Opt: + if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result): + result = [r[0] for r in result] + return result + + DictionaryObject.get_inherited = patched_get_inherited + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]") + sys.exit(1) + monkeypatch_pydpf_method() + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + fill_pdf_fields(input_pdf, fields_json, output_pdf) diff --git a/.github/skills/pdf/scripts/fill_pdf_form_with_annotations.py b/.github/skills/pdf/scripts/fill_pdf_form_with_annotations.py new file mode 100644 index 0000000..b430069 --- /dev/null +++ b/.github/skills/pdf/scripts/fill_pdf_form_with_annotations.py @@ -0,0 +1,107 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter +from pypdf.annotations import FreeText + + + + +def transform_from_image_coords(bbox, image_width, image_height, pdf_width, pdf_height): + x_scale = pdf_width / image_width + y_scale = pdf_height / image_height + + left = bbox[0] * x_scale + right = bbox[2] * x_scale + + top = pdf_height - (bbox[1] * y_scale) + bottom = pdf_height - (bbox[3] * y_scale) + + return left, bottom, right, top + + +def transform_from_pdf_coords(bbox, pdf_height): + left = bbox[0] + right = bbox[2] + + pypdf_top = pdf_height - bbox[1] + pypdf_bottom = pdf_height - bbox[3] + + return left, pypdf_bottom, right, pypdf_top + + +def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path): + + with open(fields_json_path, "r") as f: + fields_data = json.load(f) + + reader = PdfReader(input_pdf_path) + writer = PdfWriter() + + writer.append(reader) + + pdf_dimensions = {} + for i, page in enumerate(reader.pages): + mediabox = page.mediabox + pdf_dimensions[i + 1] = [mediabox.width, mediabox.height] + + annotations = [] + for field in fields_data["form_fields"]: + page_num = field["page_number"] + + page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num) + pdf_width, pdf_height = pdf_dimensions[page_num] + + if "pdf_width" in page_info: + transformed_entry_box = transform_from_pdf_coords( + field["entry_bounding_box"], + float(pdf_height) + ) + else: + image_width = page_info["image_width"] + image_height = page_info["image_height"] + transformed_entry_box = transform_from_image_coords( + field["entry_bounding_box"], + image_width, image_height, + float(pdf_width), float(pdf_height) + ) + + if "entry_text" not in field or "text" not in field["entry_text"]: + continue + entry_text = field["entry_text"] + text = entry_text["text"] + if not text: + continue + + font_name = entry_text.get("font", "Arial") + font_size = str(entry_text.get("font_size", 14)) + "pt" + font_color = entry_text.get("font_color", "000000") + + annotation = FreeText( + text=text, + rect=transformed_entry_box, + font=font_name, + font_size=font_size, + font_color=font_color, + border_color=None, + background_color=None, + ) + annotations.append(annotation) + writer.add_annotation(page_number=page_num - 1, annotation=annotation) + + with open(output_pdf_path, "wb") as output: + writer.write(output) + + print(f"Successfully filled PDF form and saved to {output_pdf_path}") + print(f"Added {len(annotations)} text annotations") + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]") + sys.exit(1) + input_pdf = sys.argv[1] + fields_json = sys.argv[2] + output_pdf = sys.argv[3] + + fill_pdf_form(input_pdf, fields_json, output_pdf) diff --git a/.github/skills/power-bi-dax-optimization/SKILL.md b/.github/skills/power-bi-dax-optimization/SKILL.md new file mode 100644 index 0000000..54821f6 --- /dev/null +++ b/.github/skills/power-bi-dax-optimization/SKILL.md @@ -0,0 +1,173 @@ +--- +name: power-bi-dax-optimization +description: 'Comprehensive Power BI DAX formula optimization prompt for improving performance, readability, and maintainability of DAX calculations.' +--- + +# Power BI DAX Formula Optimizer + +You are a Power BI DAX expert specializing in formula optimization. Your goal is to analyze, optimize, and improve DAX formulas for better performance, readability, and maintainability. + +## Analysis Framework + +When provided with a DAX formula, perform this comprehensive analysis: + +### 1. **Performance Analysis** +- Identify expensive operations and calculation patterns +- Look for repeated expressions that can be stored in variables +- Check for inefficient context transitions +- Assess filter complexity and suggest optimizations +- Evaluate aggregation function choices + +### 2. **Readability Assessment** +- Evaluate formula structure and clarity +- Check naming conventions for measures and variables +- Assess comment quality and documentation +- Review logical flow and organization + +### 3. **Best Practices Compliance** +- Verify proper use of variables (VAR statements) +- Check column vs measure reference patterns +- Validate error handling approaches +- Ensure proper function selection (DIVIDE vs /, COUNTROWS vs COUNT) + +### 4. **Maintainability Review** +- Assess formula complexity and modularity +- Check for hard-coded values that should be parameterized +- Evaluate dependency management +- Review reusability potential + +## Optimization Process + +For each DAX formula provided: + +### Step 1: **Current Formula Analysis** +``` +Analyze the provided DAX formula and identify: +- Performance bottlenecks +- Readability issues +- Best practice violations +- Potential errors or edge cases +- Maintenance challenges +``` + +### Step 2: **Optimization Strategy** +``` +Develop optimization approach: +- Variable usage opportunities +- Function replacements for performance +- Context optimization techniques +- Error handling improvements +- Structure reorganization +``` + +### Step 3: **Optimized Formula** +``` +Provide the improved DAX formula with: +- Performance optimizations applied +- Variables for repeated calculations +- Improved readability and structure +- Proper error handling +- Clear commenting and documentation +``` + +### Step 4: **Explanation and Justification** +``` +Explain all changes made: +- Performance improvements and expected impact +- Readability enhancements +- Best practice alignments +- Potential trade-offs or considerations +- Testing recommendations +``` + +## Common Optimization Patterns + +### Performance Optimizations: +- **Variable Usage**: Store expensive calculations in variables +- **Function Selection**: Use COUNTROWS instead of COUNT, SELECTEDVALUE instead of VALUES +- **Context Optimization**: Minimize context transitions in iterator functions +- **Filter Efficiency**: Use table expressions and proper filtering techniques + +### Readability Improvements: +- **Descriptive Variables**: Use meaningful variable names that explain calculations +- **Logical Structure**: Organize complex formulas with clear logical flow +- **Proper Formatting**: Use consistent indentation and line breaks +- **Documentation**: Add comments explaining business logic + +### Error Handling: +- **DIVIDE Function**: Replace division operators with DIVIDE for safety +- **BLANK Handling**: Proper handling of BLANK values without unnecessary conversion +- **Defensive Programming**: Validate inputs and handle edge cases + +## Example Output Format + +```dax +/* +ORIGINAL FORMULA ANALYSIS: +- Performance Issues: [List identified issues] +- Readability Concerns: [List readability problems] +- Best Practice Violations: [List violations] + +OPTIMIZATION STRATEGY: +- [Explain approach and changes] + +PERFORMANCE IMPACT: +- Expected improvement: [Quantify if possible] +- Areas of optimization: [List specific improvements] +*/ + +-- OPTIMIZED FORMULA: +Optimized Measure Name = +VAR DescriptiveVariableName = + CALCULATE( + [Base Measure], + -- Clear filter logic + Table[Column] = "Value" + ) +VAR AnotherCalculation = + DIVIDE( + DescriptiveVariableName, + [Denominator Measure] + ) +RETURN + IF( + ISBLANK(AnotherCalculation), + BLANK(), -- Preserve BLANK behavior + AnotherCalculation + ) +``` + +## Request Instructions + +To use this prompt effectively, provide: + +1. **The DAX formula** you want optimized +2. **Context information** such as: + - Business purpose of the calculation + - Data model relationships involved + - Performance requirements or concerns + - Current performance issues experienced +3. **Specific optimization goals** such as: + - Performance improvement + - Readability enhancement + - Best practice compliance + - Error handling improvement + +## Additional Services + +I can also help with: +- **DAX Pattern Library**: Providing templates for common calculations +- **Performance Benchmarking**: Suggesting testing approaches +- **Alternative Approaches**: Multiple optimization strategies for complex scenarios +- **Model Integration**: How the formula fits with overall model design +- **Documentation**: Creating comprehensive formula documentation + +--- + +**Usage Example:** +"Please optimize this DAX formula for better performance and readability: +```dax +Sales Growth = ([Total Sales] - CALCULATE([Total Sales], PARALLELPERIOD('Date'[Date], -12, MONTH))) / CALCULATE([Total Sales], PARALLELPERIOD('Date'[Date], -12, MONTH)) +``` + +This calculates year-over-year sales growth and is used in several report visuals. Current performance is slow when filtering by multiple dimensions." diff --git a/.github/skills/power-bi-model-design-review/SKILL.md b/.github/skills/power-bi-model-design-review/SKILL.md new file mode 100644 index 0000000..5447d29 --- /dev/null +++ b/.github/skills/power-bi-model-design-review/SKILL.md @@ -0,0 +1,403 @@ +--- +name: power-bi-model-design-review +description: 'Comprehensive Power BI data model design review prompt for evaluating model architecture, relationships, and optimization opportunities.' +--- + +# Power BI Data Model Design Review + +You are a Power BI data modeling expert conducting comprehensive design reviews. Your role is to evaluate model architecture, identify optimization opportunities, and ensure adherence to best practices for scalable, maintainable, and performant data models. + +## Review Framework + +### **Comprehensive Model Assessment** + +When reviewing a Power BI data model, conduct analysis across these key dimensions: + +#### 1. **Schema Architecture Review** +``` +Star Schema Compliance: +□ Clear separation of fact and dimension tables +□ Proper grain consistency within fact tables +□ Dimension tables contain descriptive attributes +□ Minimal snowflaking (justified when present) +□ Appropriate use of bridge tables for many-to-many + +Table Design Quality: +□ Meaningful table and column names +□ Appropriate data types for all columns +□ Proper primary and foreign key relationships +□ Consistent naming conventions +□ Adequate documentation and descriptions +``` + +#### 2. **Relationship Design Evaluation** +``` +Relationship Quality Assessment: +□ Correct cardinality settings (1:*, *:*, 1:1) +□ Appropriate filter directions (single vs. bidirectional) +□ Referential integrity settings optimized +□ Hidden foreign key columns from report view +□ Minimal circular relationship paths + +Performance Considerations: +□ Integer keys preferred over text keys +□ Low-cardinality relationship columns +□ Proper handling of missing/orphaned records +□ Efficient cross-filtering design +□ Minimal many-to-many relationships +``` + +#### 3. **Storage Mode Strategy Review** +``` +Storage Mode Optimization: +□ Import mode used appropriately for small-medium datasets +□ DirectQuery implemented properly for large/real-time data +□ Composite models designed with clear strategy +□ Dual storage mode used effectively for dimensions +□ Hybrid mode applied appropriately for fact tables + +Performance Alignment: +□ Storage modes match performance requirements +□ Data freshness needs properly addressed +□ Cross-source relationships optimized +□ Aggregation strategies implemented where beneficial +``` + +## Detailed Review Process + +### **Phase 1: Model Architecture Analysis** + +#### A. **Schema Design Assessment** +``` +Evaluate Model Structure: + +Fact Table Analysis: +- Grain definition and consistency +- Appropriate measure columns +- Foreign key completeness +- Size and growth projections +- Historical data management + +Dimension Table Analysis: +- Attribute completeness and quality +- Hierarchy design and implementation +- Slowly changing dimension handling +- Surrogate vs. natural key usage +- Reference data management + +Relationship Network Analysis: +- Star vs. snowflake patterns +- Relationship complexity assessment +- Filter propagation paths +- Cross-filtering impact evaluation +``` + +#### B. **Data Quality and Integrity Review** +``` +Data Quality Assessment: + +Completeness: +□ All required business entities represented +□ No missing critical relationships +□ Comprehensive attribute coverage +□ Proper handling of NULL values + +Consistency: +□ Consistent data types across related columns +□ Standardized naming conventions +□ Uniform formatting and encoding +□ Consistent grain across fact tables + +Accuracy: +□ Business rule implementation validation +□ Referential integrity verification +□ Data transformation accuracy +□ Calculated field correctness +``` + +### **Phase 2: Performance and Scalability Review** + +#### A. **Model Size and Efficiency Analysis** +``` +Size Optimization Assessment: + +Data Reduction Opportunities: +- Unnecessary columns identification +- Redundant data elimination +- Historical data archiving needs +- Pre-aggregation possibilities + +Compression Efficiency: +- Data type optimization opportunities +- High-cardinality column assessment +- Calculated column vs. measure usage +- Storage mode selection validation + +Scalability Considerations: +- Growth projection accommodation +- Refresh performance requirements +- Query performance expectations +- Concurrent user capacity planning +``` + +#### B. **Query Performance Analysis** +``` +Performance Pattern Review: + +DAX Optimization: +- Measure efficiency and complexity +- Variable usage in calculations +- Context transition optimization +- Iterator function performance +- Error handling implementation + +Relationship Performance: +- Join efficiency assessment +- Cross-filtering impact analysis +- Many-to-many performance implications +- Bidirectional relationship necessity + +Indexing and Aggregation: +- DirectQuery indexing requirements +- Aggregation table opportunities +- Composite model optimization +- Cache utilization strategies +``` + +### **Phase 3: Maintainability and Governance Review** + +#### A. **Model Maintainability Assessment** +``` +Maintainability Factors: + +Documentation Quality: +□ Table and column descriptions +□ Business rule documentation +□ Data source documentation +□ Relationship justification +□ Measure calculation explanations + +Code Organization: +□ Logical grouping of related measures +□ Consistent naming conventions +□ Modular design principles +□ Clear separation of concerns +□ Version control considerations + +Change Management: +□ Impact assessment procedures +□ Testing and validation processes +□ Deployment and rollback strategies +□ User communication plans +``` + +#### B. **Security and Compliance Review** +``` +Security Implementation: + +Row-Level Security: +□ RLS design and implementation +□ Performance impact assessment +□ Testing and validation completeness +□ Role-based access control +□ Dynamic security patterns + +Data Protection: +□ Sensitive data handling +□ Compliance requirements adherence +□ Audit trail implementation +□ Data retention policies +□ Privacy protection measures +``` + +## Review Output Structure + +### **Executive Summary Template** +``` +Data Model Review Summary + +Model Overview: +- Model name and purpose +- Business domain and scope +- Current size and complexity metrics +- Primary use cases and user groups + +Key Findings: +- Critical issues requiring immediate attention +- Performance optimization opportunities +- Best practice compliance assessment +- Security and governance status + +Priority Recommendations: +1. High Priority: [Critical issues impacting functionality/performance] +2. Medium Priority: [Optimization opportunities with significant benefit] +3. Low Priority: [Best practice improvements and future considerations] + +Implementation Roadmap: +- Quick wins (1-2 weeks) +- Short-term improvements (1-3 months) +- Long-term strategic enhancements (3-12 months) +``` + +### **Detailed Review Report** + +#### **Schema Architecture Section** +``` +1. Table Design Analysis + □ Fact table evaluation and recommendations + □ Dimension table optimization opportunities + □ Relationship design assessment + □ Naming convention compliance + □ Data type optimization suggestions + +2. Performance Architecture + □ Storage mode strategy evaluation + □ Size optimization recommendations + □ Query performance enhancement opportunities + □ Scalability assessment and planning + □ Aggregation and caching strategies + +3. Best Practices Compliance + □ Star schema implementation quality + □ Industry standard adherence + □ Microsoft guidance alignment + □ Documentation completeness + □ Maintenance readiness +``` + +#### **Specific Recommendations** +``` +For Each Issue Identified: + +Issue Description: +- Clear explanation of the problem +- Impact assessment (performance, maintenance, accuracy) +- Risk level and urgency classification + +Recommended Solution: +- Specific steps for resolution +- Alternative approaches when applicable +- Expected benefits and improvements +- Implementation complexity assessment +- Required resources and timeline + +Implementation Guidance: +- Step-by-step instructions +- Code examples where appropriate +- Testing and validation procedures +- Rollback considerations +- Success criteria definition +``` + +## Review Checklist Templates + +### **Quick Assessment Checklist** (30-minute review) +``` +□ Model follows star schema principles +□ Appropriate storage modes selected +□ Relationships have correct cardinality +□ Foreign keys are hidden from report view +□ Date table is properly implemented +□ No circular relationships exist +□ Measure calculations use variables appropriately +□ No unnecessary calculated columns in large tables +□ Table and column names follow conventions +□ Basic documentation is present +``` + +### **Comprehensive Review Checklist** (4-8 hour review) +``` +Architecture & Design: +□ Complete schema architecture analysis +□ Detailed relationship design review +□ Storage mode strategy evaluation +□ Performance optimization assessment +□ Scalability planning review + +Data Quality & Integrity: +□ Comprehensive data quality assessment +□ Referential integrity validation +□ Business rule implementation review +□ Error handling evaluation +□ Data transformation accuracy check + +Performance & Optimization: +□ Query performance analysis +□ DAX optimization opportunities +□ Model size optimization review +□ Refresh performance assessment +□ Concurrent usage capacity planning + +Governance & Security: +□ Security implementation review +□ Documentation quality assessment +□ Maintainability evaluation +□ Compliance requirements check +□ Change management readiness +``` + +## Specialized Review Types + +### **Pre-Production Review** +``` +Focus Areas: +- Functionality completeness +- Performance validation +- Security implementation +- User acceptance criteria +- Go-live readiness assessment + +Deliverables: +- Go/No-go recommendation +- Critical issue resolution plan +- Performance benchmark validation +- User training requirements +- Post-launch monitoring plan +``` + +### **Performance Optimization Review** +``` +Focus Areas: +- Performance bottleneck identification +- Optimization opportunity assessment +- Capacity planning validation +- Scalability improvement recommendations +- Monitoring and alerting setup + +Deliverables: +- Performance improvement roadmap +- Specific optimization recommendations +- Expected performance gains quantification +- Implementation priority matrix +- Success measurement criteria +``` + +### **Modernization Assessment** +``` +Focus Areas: +- Current state vs. best practices gap analysis +- Technology upgrade opportunities +- Architecture improvement possibilities +- Process optimization recommendations +- Skills and training requirements + +Deliverables: +- Modernization strategy and roadmap +- Cost-benefit analysis of improvements +- Risk assessment and mitigation strategies +- Implementation timeline and resource requirements +- Change management recommendations +``` + +--- + +**Usage Instructions:** +To request a data model review, provide: +- Model description and business purpose +- Current architecture overview (tables, relationships) +- Performance requirements and constraints +- Known issues or concerns +- Specific review focus areas or objectives +- Available time/resource constraints for implementation + +I'll conduct a thorough review following this framework and provide specific, actionable recommendations tailored to your model and requirements. diff --git a/.github/skills/power-bi-performance-troubleshooting/SKILL.md b/.github/skills/power-bi-performance-troubleshooting/SKILL.md new file mode 100644 index 0000000..89c04bb --- /dev/null +++ b/.github/skills/power-bi-performance-troubleshooting/SKILL.md @@ -0,0 +1,382 @@ +--- +name: power-bi-performance-troubleshooting +description: 'Systematic Power BI performance troubleshooting prompt for identifying, diagnosing, and resolving performance issues in Power BI models, reports, and queries.' +--- + +# Power BI Performance Troubleshooting Guide + +You are a Power BI performance expert specializing in diagnosing and resolving performance issues across models, reports, and queries. Your role is to provide systematic troubleshooting guidance and actionable solutions. + +## Troubleshooting Methodology + +### Step 1: **Problem Definition and Scope** +Begin by clearly defining the performance issue: + +``` +Issue Classification: +□ Model loading/refresh performance +□ Report page loading performance +□ Visual interaction responsiveness +□ Query execution speed +□ Capacity resource constraints +□ Data source connectivity issues + +Scope Assessment: +□ Affects all users vs. specific users +□ Occurs at specific times vs. consistently +□ Impacts specific reports vs. all reports +□ Happens with certain data filters vs. all scenarios +``` + +### Step 2: **Performance Baseline Collection** +Gather current performance metrics: + +``` +Required Metrics: +- Page load times (target: <10 seconds) +- Visual interaction response (target: <3 seconds) +- Query execution times (target: <30 seconds) +- Model refresh duration (varies by model size) +- Memory and CPU utilization +- Concurrent user load +``` + +### Step 3: **Systematic Diagnosis** +Use this diagnostic framework: + +#### A. **Model Performance Issues** +``` +Data Model Analysis: +✓ Model size and complexity +✓ Relationship design and cardinality +✓ Storage mode configuration (Import/DirectQuery/Composite) +✓ Data types and compression efficiency +✓ Calculated columns vs. measures usage +✓ Date table implementation + +Common Model Issues: +- Large model size due to unnecessary columns/rows +- Inefficient relationships (many-to-many, bidirectional) +- High-cardinality text columns +- Excessive calculated columns +- Missing or improper date tables +- Poor data type selections +``` + +#### B. **DAX Performance Issues** +``` +DAX Formula Analysis: +✓ Complex calculations without variables +✓ Inefficient aggregation functions +✓ Context transition overhead +✓ Iterator function optimization +✓ Filter context complexity +✓ Error handling patterns + +Performance Anti-Patterns: +- Repeated calculations (missing variables) +- FILTER() used as filter argument +- Complex calculated columns in large tables +- Nested CALCULATE functions +- Inefficient time intelligence patterns +``` + +#### C. **Report Design Issues** +``` +Report Performance Analysis: +✓ Number of visuals per page (max 6-8 recommended) +✓ Visual types and complexity +✓ Cross-filtering configuration +✓ Slicer query efficiency +✓ Custom visual performance impact +✓ Mobile layout optimization + +Common Report Issues: +- Too many visuals causing resource competition +- Inefficient cross-filtering patterns +- High-cardinality slicers +- Complex custom visuals +- Poorly optimized visual interactions +``` + +#### D. **Infrastructure and Capacity Issues** +``` +Infrastructure Assessment: +✓ Capacity utilization (CPU, memory, query volume) +✓ Network connectivity and bandwidth +✓ Data source performance +✓ Gateway configuration and performance +✓ Concurrent user load patterns +✓ Geographic distribution considerations + +Capacity Indicators: +- High CPU utilization (>70% sustained) +- Memory pressure warnings +- Query queuing and timeouts +- Gateway performance bottlenecks +- Network latency issues +``` + +## Diagnostic Tools and Techniques + +### **Power BI Desktop Tools** +``` +Performance Analyzer: +- Enable and record visual refresh times +- Identify slowest visuals and operations +- Compare DAX query vs. visual rendering time +- Export results for detailed analysis + +Usage: +1. Open Performance Analyzer pane +2. Start recording +3. Refresh visuals or interact with report +4. Analyze results by duration +5. Focus on highest duration items first +``` + +### **DAX Studio Analysis** +``` +Advanced DAX Analysis: +- Query execution plans +- Storage engine vs. formula engine usage +- Memory consumption patterns +- Query performance metrics +- Server timings analysis + +Key Metrics to Monitor: +- Total duration +- Formula engine duration +- Storage engine duration +- Scan count and efficiency +- Memory usage patterns +``` + +### **Capacity Monitoring** +``` +Fabric Capacity Metrics App: +- CPU and memory utilization trends +- Query volume and patterns +- Refresh performance tracking +- User activity analysis +- Resource bottleneck identification + +Premium Capacity Monitoring: +- Capacity utilization dashboards +- Performance threshold alerts +- Historical trend analysis +- Workload distribution assessment +``` + +## Solution Framework + +### **Immediate Performance Fixes** + +#### Model Optimization: +```dax +-- Replace inefficient patterns: + +❌ Poor Performance: +Sales Growth = +([Total Sales] - CALCULATE([Total Sales], PREVIOUSMONTH('Date'[Date]))) / +CALCULATE([Total Sales], PREVIOUSMONTH('Date'[Date])) + +✅ Optimized Version: +Sales Growth = +VAR CurrentMonth = [Total Sales] +VAR PreviousMonth = CALCULATE([Total Sales], PREVIOUSMONTH('Date'[Date])) +RETURN + DIVIDE(CurrentMonth - PreviousMonth, PreviousMonth) +``` + +#### Report Optimization: +- Reduce visuals per page to 6-8 maximum +- Implement drill-through instead of showing all details +- Use bookmarks for different views instead of multiple visuals +- Apply filters early to reduce data volume +- Optimize slicer selections and cross-filtering + +#### Data Model Optimization: +- Remove unused columns and tables +- Optimize data types (integers vs. text, dates vs. datetime) +- Replace calculated columns with measures where possible +- Implement proper star schema relationships +- Use incremental refresh for large datasets + +### **Advanced Performance Solutions** + +#### Storage Mode Optimization: +``` +Import Mode Optimization: +- Data reduction techniques +- Pre-aggregation strategies +- Incremental refresh implementation +- Compression optimization + +DirectQuery Optimization: +- Database index optimization +- Query folding maximization +- Aggregation table implementation +- Connection pooling configuration + +Composite Model Strategy: +- Strategic storage mode selection +- Cross-source relationship optimization +- Dual mode dimension implementation +- Performance monitoring setup +``` + +#### Infrastructure Scaling: +``` +Capacity Scaling Considerations: +- Vertical scaling (more powerful capacity) +- Horizontal scaling (distributed workload) +- Geographic distribution optimization +- Load balancing implementation + +Gateway Optimization: +- Dedicated gateway clusters +- Load balancing configuration +- Connection optimization +- Performance monitoring setup +``` + +## Troubleshooting Workflows + +### **Quick Win Checklist** (30 minutes) +``` +□ Check Performance Analyzer for obvious bottlenecks +□ Reduce number of visuals on slow-loading pages +□ Apply default filters to reduce data volume +□ Disable unnecessary cross-filtering +□ Check for missing relationships causing cross-joins +□ Verify appropriate storage modes +□ Review and optimize top 3 slowest DAX measures +``` + +### **Comprehensive Analysis** (2-4 hours) +``` +□ Complete model architecture review +□ DAX optimization using variables and efficient patterns +□ Report design optimization and restructuring +□ Data source performance analysis +□ Capacity utilization assessment +□ User access pattern analysis +□ Mobile performance testing +□ Load testing with realistic concurrent users +``` + +### **Strategic Optimization** (1-2 weeks) +``` +□ Complete data model redesign if necessary +□ Implementation of aggregation strategies +□ Infrastructure scaling planning +□ Monitoring and alerting setup +□ User training on efficient usage patterns +□ Performance governance implementation +□ Continuous monitoring and optimization process +``` + +## Performance Monitoring Setup + +### **Proactive Monitoring** +``` +Key Performance Indicators: +- Average page load time by report +- Query execution time percentiles +- Model refresh duration trends +- Capacity utilization patterns +- User adoption and usage metrics +- Error rates and timeout occurrences + +Alerting Thresholds: +- Page load time >15 seconds +- Query execution time >45 seconds +- Capacity CPU >80% for >10 minutes +- Memory utilization >90% +- Refresh failures +- High error rates +``` + +### **Regular Health Checks** +``` +Weekly: +□ Review performance dashboards +□ Check capacity utilization trends +□ Monitor slow-running queries +□ Review user feedback and issues + +Monthly: +□ Comprehensive performance analysis +□ Model optimization opportunities +□ Capacity planning review +□ User training needs assessment + +Quarterly: +□ Strategic performance review +□ Technology updates and optimizations +□ Scaling requirements assessment +□ Performance governance updates +``` + +## Communication and Documentation + +### **Issue Reporting Template** +``` +Performance Issue Report: + +Issue Description: +- What specific performance problem is occurring? +- When does it happen (always, specific times, certain conditions)? +- Who is affected (all users, specific groups, particular reports)? + +Performance Metrics: +- Current performance measurements +- Expected performance targets +- Comparison with previous performance + +Environment Details: +- Report/model names affected +- User locations and network conditions +- Browser and device information +- Capacity and infrastructure details + +Impact Assessment: +- Business impact and urgency +- Number of users affected +- Critical business processes impacted +- Workarounds currently in use +``` + +### **Resolution Documentation** +``` +Solution Summary: +- Root cause analysis results +- Optimization changes implemented +- Performance improvement achieved +- Validation and testing completed + +Implementation Details: +- Step-by-step changes made +- Configuration modifications +- Code changes (DAX, model design) +- Infrastructure adjustments + +Results and Follow-up: +- Before/after performance metrics +- User feedback and validation +- Monitoring setup for ongoing health +- Recommendations for similar issues +``` + +--- + +**Usage Instructions:** +Provide details about your specific Power BI performance issue, including: +- Symptoms and impact description +- Current performance metrics +- Environment and configuration details +- Previous troubleshooting attempts +- Business requirements and constraints + +I'll guide you through systematic diagnosis and provide specific, actionable solutions tailored to your situation. diff --git a/.github/skills/power-bi-report-design-consultation/SKILL.md b/.github/skills/power-bi-report-design-consultation/SKILL.md new file mode 100644 index 0000000..85a520c --- /dev/null +++ b/.github/skills/power-bi-report-design-consultation/SKILL.md @@ -0,0 +1,351 @@ +--- +name: power-bi-report-design-consultation +description: 'Power BI report visualization design prompt for creating effective, user-friendly, and accessible reports with optimal chart selection and layout design.' +--- + +# Power BI Report Visualization Designer + +You are a Power BI visualization and user experience expert specializing in creating effective, accessible, and engaging reports. Your role is to guide the design of reports that clearly communicate insights and enable data-driven decision making. + +## Design Consultation Framework + +### **Initial Requirements Gathering** + +Before recommending visualizations, understand the context: + +``` +Business Context Assessment: +□ What business problem are you trying to solve? +□ Who is the target audience (executives, analysts, operators)? +□ What decisions will this report support? +□ What are the key performance indicators? +□ How will the report be accessed (desktop, mobile, presentation)? + +Data Context Analysis: +□ What data types are involved (categorical, numerical, temporal)? +□ What is the data volume and granularity? +□ Are there hierarchical relationships in the data? +□ What are the most important comparisons or trends? +□ Are there specific drill-down requirements? + +Technical Requirements: +□ Performance constraints and expected load +□ Accessibility requirements +□ Brand guidelines and color restrictions +□ Mobile and responsive design needs +□ Integration with other systems or reports +``` + +### **Chart Selection Methodology** + +#### **Data Relationship Analysis** +``` +Comparison Analysis: +✅ Bar/Column Charts: Comparing categories, ranking items +✅ Horizontal Bars: Long category names, space constraints +✅ Bullet Charts: Performance against targets +✅ Dot Plots: Precise value comparison with minimal ink + +Trend Analysis: +✅ Line Charts: Continuous time series, multiple metrics +✅ Area Charts: Cumulative values, composition over time +✅ Stepped Lines: Discrete changes, status transitions +✅ Sparklines: Inline trend indicators + +Composition Analysis: +✅ Stacked Bars: Parts of whole with comparison +✅ Donut/Pie Charts: Simple composition (max 5-7 categories) +✅ Treemaps: Hierarchical composition, space-efficient +✅ Waterfall: Sequential changes, bridge analysis + +Distribution Analysis: +✅ Histograms: Frequency distribution +✅ Box Plots: Statistical distribution summary +✅ Scatter Plots: Correlation, outlier identification +✅ Heat Maps: Two-dimensional patterns +``` + +#### **Audience-Specific Design Patterns** +``` +Executive Dashboard Design: +- High-level KPIs prominently displayed +- Exception-based highlighting (red/yellow/green) +- Trend indicators with clear direction arrows +- Minimal text, maximum insight density +- Clean, uncluttered design with plenty of white space + +Analytical Report Design: +- Multiple levels of detail with drill-down capability +- Comparative analysis tools (period-over-period) +- Interactive filtering and exploration options +- Detailed data tables when needed +- Comprehensive legends and context information + +Operational Report Design: +- Real-time or near real-time data display +- Action-oriented design with clear status indicators +- Exception-based alerts and notifications +- Mobile-optimized for field use +- Quick refresh and update capabilities +``` + +## Visualization Design Process + +### **Phase 1: Information Architecture** +``` +Content Prioritization: +1. Critical Metrics: Most important KPIs and measures +2. Supporting Context: Trends, comparisons, breakdowns +3. Detailed Analysis: Drill-down data and specifics +4. Navigation & Filters: User control elements + +Layout Strategy: +┌─────────────────────────────────────────┐ +│ Header: Title, Key KPIs, Date Range │ +├─────────────────────────────────────────┤ +│ Primary Insight Area │ +│ ┌─────────────┐ ┌─────────────────────┐│ +│ │ Main │ │ Supporting ││ +│ │ Visual │ │ Context ││ +│ │ │ │ (2-3 smaller ││ +│ │ │ │ visuals) ││ +│ └─────────────┘ └─────────────────────┘│ +├─────────────────────────────────────────┤ +│ Secondary Analysis (Details/Drill-down) │ +├─────────────────────────────────────────┤ +│ Filters & Navigation Controls │ +└─────────────────────────────────────────┘ +``` + +### **Phase 2: Visual Design Specifications** + +#### **Color Strategy Design** +``` +Semantic Color Mapping: +- Green (#2E8B57): Positive performance, on-target, growth +- Red (#DC143C): Negative performance, alerts, below-target +- Blue (#4682B4): Neutral information, base metrics +- Orange (#FF8C00): Warnings, attention needed +- Gray (#708090): Inactive, reference, disabled states + +Accessibility Compliance: +✅ Minimum 4.5:1 contrast ratio for text +✅ Colorblind-friendly palette (avoid red-green only distinctions) +✅ Pattern and shape alternatives to color coding +✅ High contrast mode compatibility +✅ Alternative text for screen readers + +Brand Integration Guidelines: +- Primary brand color for key metrics and headers +- Secondary palette for data categorization +- Neutral grays for backgrounds and borders +- Accent colors for highlights and interactions +``` + +#### **Typography Hierarchy** +``` +Text Size and Weight Guidelines: +- Report Title: 20-24pt, Bold, Brand Font +- Page Titles: 16-18pt, Semi-bold, Sans-serif +- Section Headers: 14-16pt, Semi-bold +- Visual Titles: 12-14pt, Medium weight +- Data Labels: 10-12pt, Regular +- Footnotes/Captions: 9-10pt, Light + +Readability Optimization: +✅ Consistent font family (maximum 2 families) +✅ Sufficient line spacing and letter spacing +✅ Left-aligned text for body content +✅ Centered alignment only for titles +✅ Adequate white space around text elements +``` + +### **Phase 3: Interactive Design** + +#### **Navigation Design Patterns** +``` +Tab Navigation: +Best for: Related content areas, different time periods +Implementation: +- Clear tab labels (max 7 tabs) +- Visual indication of active tab +- Consistent content layout across tabs +- Logical ordering by importance or workflow + +Drill-through Design: +Best for: Detail exploration, context switching +Implementation: +- Clear visual cues for drill-through availability +- Contextual page design with proper filtering +- Back button for easy return navigation +- Consistent styling between levels + +Button Navigation: +Best for: Guided workflows, external links +Implementation: +- Action-oriented button labels +- Consistent styling and sizing +- Appropriate visual hierarchy +- Touch-friendly sizing (minimum 44px) +``` + +#### **Filter and Slicer Design** +``` +Slicer Optimization: +✅ Logical grouping and positioning +✅ Search functionality for high-cardinality fields +✅ Single vs. multi-select based on use case +✅ Clear visual indication of applied filters +✅ Reset/clear all options + +Filter Strategy: +- Page-level filters for common scenarios +- Visual-level filters for specific needs +- Report-level filters for global constraints +- Drill-through filters for detailed analysis +``` + +### **Phase 4: Mobile and Responsive Design** + +#### **Mobile Layout Strategy** +``` +Mobile-First Considerations: +- Portrait orientation as primary design +- Touch-friendly interaction targets (44px minimum) +- Simplified navigation with hamburger menus +- Stacked layout instead of side-by-side +- Larger fonts and increased spacing + +Responsive Visual Selection: +Mobile-Friendly: +✅ Card visuals for KPIs +✅ Simple bar and column charts +✅ Line charts with minimal data points +✅ Large gauge and KPI visuals + +Mobile-Challenging: +❌ Dense matrices and tables +❌ Complex scatter plots +❌ Multi-series area charts +❌ Small multiple visuals +``` + +## Design Review and Validation + +### **Design Quality Checklist** +``` +Visual Clarity: +□ Clear visual hierarchy with appropriate emphasis +□ Sufficient contrast and readability +□ Logical flow and eye movement patterns +□ Minimal cognitive load for interpretation +□ Appropriate use of white space + +Functional Design: +□ All interactions work intuitively +□ Navigation is clear and consistent +□ Filtering behaves as expected +□ Mobile experience is usable +□ Performance is acceptable across devices + +Accessibility Compliance: +□ Screen reader compatibility +□ Keyboard navigation support +□ High contrast compliance +□ Alternative text provided +□ Color is not the only information carrier +``` + +### **User Testing Framework** +``` +Usability Testing Protocol: + +Pre-Test Setup: +- Define test scenarios and tasks +- Prepare realistic test data +- Set up observation and recording +- Brief participants on context + +Test Scenarios: +1. Initial impression and orientation (30 seconds) +2. Finding specific information (2 minutes) +3. Comparing data points (3 minutes) +4. Drilling down for details (2 minutes) +5. Mobile usage simulation (5 minutes) + +Success Criteria: +- Task completion rates >80% +- Time to insight <2 minutes +- User satisfaction scores >4/5 +- No critical usability issues +- Accessibility validation passed +``` + +## Visualization Recommendations Output + +### **Design Specification Template** +``` +Visualization Design Recommendations + +Executive Summary: +- Report purpose and target audience +- Key design principles applied +- Primary visual selections and rationale +- Expected user experience outcomes + +Visual Architecture: +Page 1: Dashboard Overview +├─ Header KPI Cards (4-5 key metrics) +├─ Primary Chart: [Chart Type] showing [Data Story] +├─ Supporting Visuals: [2-3 context charts] +└─ Filter Panel: [Key filter controls] + +Page 2: Detailed Analysis +├─ Comparative Analysis: [Chart selection] +├─ Trend Analysis: [Time-based visuals] +├─ Distribution Analysis: [Statistical charts] +└─ Navigation: Drill-through to operational data + +Interaction Design: +- Cross-filtering strategy +- Drill-through implementation +- Navigation flow design +- Mobile optimization approach +``` + +### **Implementation Guidelines** +``` +Development Priority: +Phase 1 (Week 1): Core dashboard with KPIs and primary visual +Phase 2 (Week 2): Supporting visuals and basic interactions +Phase 3 (Week 3): Advanced interactions and drill-through +Phase 4 (Week 4): Mobile optimization and final polish + +Quality Assurance: +□ Visual accuracy validation +□ Interaction testing across browsers +□ Mobile device testing +□ Accessibility compliance check +□ Performance validation +□ User acceptance testing + +Success Metrics: +- User engagement and adoption rates +- Time to insight measurements +- Decision-making improvement indicators +- User satisfaction feedback +- Performance benchmarks achievement +``` + +--- + +**Usage Instructions:** +To get visualization design recommendations, provide: +- Business context and report objectives +- Target audience and usage scenarios +- Data description and key metrics +- Technical constraints and requirements +- Brand guidelines and accessibility needs +- Specific design challenges or questions + +I'll provide comprehensive design recommendations including chart selection, layout design, interaction patterns, and implementation guidance tailored to your specific needs and context. diff --git a/.github/skills/powerbi-modeling/SKILL.md b/.github/skills/powerbi-modeling/SKILL.md new file mode 100644 index 0000000..b4c4f14 --- /dev/null +++ b/.github/skills/powerbi-modeling/SKILL.md @@ -0,0 +1,153 @@ +--- +name: powerbi-modeling +description: 'Power BI semantic modeling assistant for building optimized data models. Use when working with Power BI semantic models, creating measures, designing star schemas, configuring relationships, implementing RLS, or optimizing model performance. Triggers on queries about DAX calculations, table relationships, dimension/fact table design, naming conventions, model documentation, cardinality, cross-filter direction, calculation groups, and data model best practices. Always connects to the active model first using power-bi-modeling MCP tools to understand the data structure before providing guidance.' +--- + +# Power BI Semantic Modeling + +Guide users in building optimized, well-documented Power BI semantic models following Microsoft best practices. + +## When to Use This Skill + +Use this skill when users ask about: +- Creating or optimizing Power BI semantic models +- Designing star schemas (dimension/fact tables) +- Writing DAX measures or calculated columns +- Configuring table relationships (cardinality, cross-filter) +- Implementing row-level security (RLS) +- Naming conventions for tables, columns, measures +- Adding descriptions and documentation to models +- Performance tuning and optimization +- Calculation groups and field parameters +- Model validation and best practice checks + +**Trigger phrases:** "create a measure", "add relationship", "star schema", "optimize model", "DAX formula", "RLS", "naming convention", "model documentation", "cardinality", "cross-filter" + +## Prerequisites + +### Required Tools +- **Power BI Modeling MCP Server**: Required for connecting to and modifying semantic models + - Enables: connection_operations, table_operations, measure_operations, relationship_operations, etc. + - Must be configured and running to interact with models + +### Optional Dependencies +- **Microsoft Learn MCP Server**: Recommended for researching latest best practices + - Enables: microsoft_docs_search, microsoft_docs_fetch + - Use for complex scenarios, new features, and official documentation + +## Workflow + +### 1. Connect and Analyze First + +Before providing any modeling guidance, always examine the current model state: + +``` +1. List connections: connection_operations(operation: "ListConnections") +2. If no connection, check for local instances: connection_operations(operation: "ListLocalInstances") +3. Connect to the model (Desktop or Fabric) +4. Get model overview: model_operations(operation: "Get") +5. List tables: table_operations(operation: "List") +6. List relationships: relationship_operations(operation: "List") +7. List measures: measure_operations(operation: "List") +``` + +### 2. Evaluate Model Health + +After connecting, assess the model against best practices: + +- **Star Schema**: Are tables properly classified as dimension or fact? +- **Relationships**: Correct cardinality? Minimal bidirectional filters? +- **Naming**: Human-readable, consistent naming conventions? +- **Documentation**: Do tables, columns, measures have descriptions? +- **Measures**: Explicit measures for key calculations? +- **Hidden Fields**: Are technical columns hidden from report view? + +### 3. Provide Targeted Guidance + +Based on analysis, guide improvements using references: +- Star schema design: See [STAR-SCHEMA.md](references/STAR-SCHEMA.md) +- Relationship configuration: See [RELATIONSHIPS.md](references/RELATIONSHIPS.md) +- DAX measures and naming: See [MEASURES-DAX.md](references/MEASURES-DAX.md) +- Performance optimization: See [PERFORMANCE.md](references/PERFORMANCE.md) +- Row-level security: See [RLS.md](references/RLS.md) + +## Quick Reference: Model Quality Checklist + +| Area | Best Practice | +|------|--------------| +| Tables | Clear dimension vs fact classification | +| Naming | Human-readable: `Customer Name` not `CUST_NM` | +| Descriptions | All tables, columns, measures documented | +| Measures | Explicit DAX measures for business metrics | +| Relationships | One-to-many from dimension to fact | +| Cross-filter | Single direction unless specifically needed | +| Hidden fields | Hide technical keys, IDs from report view | +| Date table | Dedicated marked date table | + +## MCP Tools Reference + +Use these Power BI Modeling MCP operations: + +| Operation Category | Key Operations | +|-------------------|----------------| +| `connection_operations` | Connect, ListConnections, ListLocalInstances, ConnectFabric | +| `model_operations` | Get, GetStats, ExportTMDL | +| `table_operations` | List, Get, Create, Update, GetSchema | +| `column_operations` | List, Get, Create, Update (descriptions, hidden, format) | +| `measure_operations` | List, Get, Create, Update, Move | +| `relationship_operations` | List, Get, Create, Update, Activate, Deactivate | +| `dax_query_operations` | Execute, Validate | +| `calculation_group_operations` | List, Create, Update | +| `security_role_operations` | List, Create, Update, GetEffectivePermissions | + +## Common Tasks + +### Add Measure with Description +``` +measure_operations( + operation: "Create", + definitions: [{ + name: "Total Sales", + tableName: "Sales", + expression: "SUM(Sales[Amount])", + formatString: "$#,##0", + description: "Sum of all sales amounts" + }] +) +``` + +### Update Column Description +``` +column_operations( + operation: "Update", + definitions: [{ + tableName: "Customer", + name: "CustomerKey", + description: "Unique identifier for customer dimension", + isHidden: true + }] +) +``` + +### Create Relationship +``` +relationship_operations( + operation: "Create", + definitions: [{ + fromTable: "Sales", + fromColumn: "CustomerKey", + toTable: "Customer", + toColumn: "CustomerKey", + crossFilteringBehavior: "OneDirection" + }] +) +``` + +## When to Use Microsoft Learn MCP + +Research current best practices using `microsoft_docs_search` for: +- Latest DAX function documentation +- New Power BI features and capabilities +- Complex modeling scenarios (SCD Type 2, many-to-many) +- Performance optimization techniques +- Security implementation patterns diff --git a/.github/skills/powerbi-modeling/references/MEASURES-DAX.md b/.github/skills/powerbi-modeling/references/MEASURES-DAX.md new file mode 100644 index 0000000..2bd6ff0 --- /dev/null +++ b/.github/skills/powerbi-modeling/references/MEASURES-DAX.md @@ -0,0 +1,195 @@ +# DAX Measures and Naming Conventions + +## Naming Conventions + +### General Rules +- Use human-readable names (spaces allowed) +- Be descriptive: `Total Sales Amount` not `TSA` +- Avoid abbreviations unless universally understood +- Use consistent capitalization (Title Case recommended) +- Avoid special characters except spaces + +### Table Naming +| Type | Convention | Example | +|------|------------|---------| +| Dimension | Singular noun | Customer, Product, Date | +| Fact | Business process | Sales, Orders, Inventory | +| Bridge | Combined names | CustomerAccount, ProductCategory | +| Measure Table | Underscore prefix | _Measures, _KPIs | + +### Column Naming +| Type | Convention | Example | +|------|------------|---------| +| Keys | Suffix with "Key" or "ID" | CustomerKey, ProductID | +| Dates | Suffix with "Date" | OrderDate, ShipDate | +| Amounts | Descriptive with unit hint | SalesAmount, QuantitySold | +| Flags | Prefix with "Is" or "Has" | IsActive, HasDiscount | + +### Measure Naming +| Type | Convention | Example | +|------|------------|---------| +| Aggregations | Verb + Noun | Total Sales, Count of Orders | +| Ratios | X per Y or X Rate | Sales per Customer, Conversion Rate | +| Time Intelligence | Period + Metric | YTD Sales, PY Total Sales | +| Comparisons | Metric + vs + Baseline | Sales vs Budget, Growth vs PY | + +## Explicit vs Implicit Measures + +### Always Create Explicit Measures For: +1. Key business metrics users will query +2. Complex calculations with filter manipulation +3. Measures used in MDX (Excel PivotTables) +4. Controlled aggregation (prevent sum of averages) + +### Implicit Measures (Column Aggregations) +- Acceptable for simple exploration +- Set correct SummarizeBy property: + - Amounts: Sum + - Keys/IDs: None (Do Not Summarize) + - Rates/Prices: None or Average + +## Measure Patterns + +### Basic Aggregations +```dax +Total Sales = SUM(Sales[SalesAmount]) +Order Count = COUNTROWS(Sales) +Average Order Value = DIVIDE([Total Sales], [Order Count]) +Distinct Customers = DISTINCTCOUNT(Sales[CustomerKey]) +``` + +### Time Intelligence (Requires Date Table) +```dax +YTD Sales = TOTALYTD([Total Sales], 'Date'[Date]) +MTD Sales = TOTALMTD([Total Sales], 'Date'[Date]) +PY Sales = CALCULATE([Total Sales], SAMEPERIODLASTYEAR('Date'[Date])) +YoY Growth = DIVIDE([Total Sales] - [PY Sales], [PY Sales]) +``` + +### Percentage Calculations +```dax +Sales % of Total = +DIVIDE( + [Total Sales], + CALCULATE([Total Sales], REMOVEFILTERS(Product)) +) + +Margin % = DIVIDE([Gross Profit], [Total Sales]) +``` + +### Running Totals +```dax +Running Total = +CALCULATE( + [Total Sales], + FILTER( + ALL('Date'), + 'Date'[Date] <= MAX('Date'[Date]) + ) +) +``` + +## Column References + +### Best Practice: Always Qualify Column Names +```dax +// GOOD - Fully qualified +Sales Amount = SUM(Sales[SalesAmount]) + +// BAD - Unqualified (can cause ambiguity) +Sales Amount = SUM([SalesAmount]) +``` + +### Measure References: Never Qualify +```dax +// GOOD - Unqualified measure +YTD Sales = TOTALYTD([Total Sales], 'Date'[Date]) + +// BAD - Qualified measure (breaks if home table changes) +YTD Sales = TOTALYTD(Sales[Total Sales], 'Date'[Date]) +``` + +## Documentation + +### Measure Descriptions +Always add descriptions explaining: +- What the measure calculates +- Business context/usage +- Any important assumptions + +``` +measure_operations( + operation: "Update", + definitions: [{ + name: "Total Sales", + tableName: "Sales", + description: "Sum of all completed sales transactions. Excludes returns and cancelled orders." + }] +) +``` + +### Format Strings +| Data Type | Format String | Example Output | +|-----------|---------------|----------------| +| Currency | $#,##0.00 | $1,234.56 | +| Percentage | 0.0% | 12.3% | +| Whole Number | #,##0 | 1,234 | +| Decimal | #,##0.00 | 1,234.56 | + +## Display Folders + +Organize measures into logical groups: +``` +measure_operations( + operation: "Update", + definitions: [{ + name: "YTD Sales", + tableName: "_Measures", + displayFolder: "Time Intelligence\\Year" + }] +) +``` + +Common folder structure: +``` +_Measures +├── Sales +│ ├── Total Sales +│ └── Average Sale +├── Time Intelligence +│ ├── Year +│ │ ├── YTD Sales +│ │ └── PY Sales +│ └── Month +│ └── MTD Sales +└── Ratios + ├── Margin % + └── Conversion Rate +``` + +## Variables for Performance + +Use variables to: +- Avoid recalculating the same expression +- Improve readability +- Enable debugging + +```dax +Gross Margin % = +VAR TotalSales = [Total Sales] +VAR TotalCost = [Total Cost] +VAR GrossProfit = TotalSales - TotalCost +RETURN + DIVIDE(GrossProfit, TotalSales) +``` + +## Validation Checklist + +- [ ] All key business metrics have explicit measures +- [ ] Measures have clear, descriptive names +- [ ] Measures have descriptions +- [ ] Appropriate format strings applied +- [ ] Display folders organize related measures +- [ ] Column references are fully qualified +- [ ] Measure references are not qualified +- [ ] Variables used for complex calculations diff --git a/.github/skills/powerbi-modeling/references/PERFORMANCE.md b/.github/skills/powerbi-modeling/references/PERFORMANCE.md new file mode 100644 index 0000000..c7a3428 --- /dev/null +++ b/.github/skills/powerbi-modeling/references/PERFORMANCE.md @@ -0,0 +1,215 @@ +# Performance Optimization for Power BI Models + +## Data Reduction Techniques + +### 1. Remove Unnecessary Columns +- Only import columns needed for reporting +- Remove audit columns (CreatedBy, ModifiedDate) unless required +- Remove duplicate/redundant columns + +``` +column_operations(operation: "List", filter: { tableNames: ["Sales"] }) +// Review and remove unneeded columns +``` + +### 2. Remove Unnecessary Rows +- Filter historical data to relevant period +- Exclude cancelled/void transactions if not needed +- Apply filters in Power Query (not in DAX) + +### 3. Reduce Cardinality +High cardinality (many unique values) impacts: +- Model size +- Refresh time +- Query performance + +**Solutions:** +| Column Type | Reduction Technique | +|-------------|---------------------| +| DateTime | Split into Date and Time columns | +| Decimal precision | Round to needed precision | +| Text with patterns | Extract common prefix/suffix | +| High-precision IDs | Use surrogate integer keys | + +### 4. Optimize Data Types +| From | To | Benefit | +|------|-----|---------| +| DateTime | Date (if time not needed) | 8 bytes to 4 bytes | +| Decimal | Fixed Decimal | Better compression | +| Text with numbers | Whole Number | Much better compression | +| Long text | Shorter text | Reduces storage | + +### 5. Group and Summarize +Pre-aggregate data when detail not needed: +- Daily instead of transactional +- Monthly instead of daily +- Consider aggregation tables for DirectQuery + +## Column Optimization + +### Prefer Power Query Columns Over Calculated Columns +| Approach | When to Use | +|----------|-------------| +| Power Query (M) | Can be computed at source, static values | +| Calculated Column (DAX) | Needs model relationships, dynamic logic | + +Power Query columns: +- Load faster +- Compress better +- Use less memory + +### Avoid Calculated Columns on Relationship Keys +DAX calculated columns in relationships: +- Cannot use indexes +- Generate complex SQL for DirectQuery +- Hurt performance significantly + +**Use COMBINEVALUES for multi-column relationships:** +```dax +// If you must use calculated column for composite key +CompositeKey = COMBINEVALUES(",", [Country], [City]) +``` + +### Set Appropriate Summarization +Prevent accidental aggregation of non-additive columns: +``` +column_operations( + operation: "Update", + definitions: [{ + tableName: "Product", + name: "UnitPrice", + summarizeBy: "None" + }] +) +``` + +## Relationship Optimization + +### 1. Minimize Bidirectional Relationships +Each bidirectional relationship: +- Increases query complexity +- Can create ambiguous paths +- Reduces performance + +### 2. Avoid Many-to-Many When Possible +Many-to-many relationships: +- Generate more complex queries +- Require more memory +- Can produce unexpected results + +### 3. Reduce Relationship Cardinality +Keep relationship columns low cardinality: +- Use integer keys over text +- Consider higher-grain relationships + +## DAX Optimization + +### 1. Use Variables +```dax +// GOOD - Calculate once, use twice +Sales Growth = +VAR CurrentSales = [Total Sales] +VAR PriorSales = [PY Sales] +RETURN DIVIDE(CurrentSales - PriorSales, PriorSales) + +// BAD - Recalculates [Total Sales] and [PY Sales] +Sales Growth = +DIVIDE([Total Sales] - [PY Sales], [PY Sales]) +``` + +### 2. Avoid FILTER with Entire Tables +```dax +// BAD - Iterates entire table +Sales High Value = +CALCULATE([Total Sales], FILTER(Sales, Sales[Amount] > 1000)) + +// GOOD - Uses column reference +Sales High Value = +CALCULATE([Total Sales], Sales[Amount] > 1000) +``` + +### 3. Use KEEPFILTERS Appropriately +```dax +// Respects existing filters +Sales with Filter = +CALCULATE([Total Sales], KEEPFILTERS(Product[Category] = "Bikes")) +``` + +### 4. Prefer DIVIDE Over Division Operator +```dax +// GOOD - Handles divide by zero +Margin % = DIVIDE([Profit], [Sales]) + +// BAD - Errors on zero +Margin % = [Profit] / [Sales] +``` + +## DirectQuery Optimization + +### 1. Minimize Columns and Tables +DirectQuery models: +- Query source for every visual +- Performance depends on source +- Minimize data retrieved + +### 2. Avoid Complex Power Query Transformations +- Transforms become subqueries +- Native queries are faster +- Materialize at source when possible + +### 3. Keep Measures Simple Initially +Complex DAX generates complex SQL: +- Start with basic aggregations +- Add complexity gradually +- Monitor query performance + +### 4. Disable Auto Date/Time +For DirectQuery models, disable auto date/time: +- Creates hidden calculated tables +- Increases model complexity +- Use explicit date table instead + +## Aggregations + +### User-Defined Aggregations +Pre-aggregate fact tables for: +- Very large models (billions of rows) +- Hybrid DirectQuery/Import +- Common query patterns + +``` +table_operations( + operation: "Create", + definitions: [{ + name: "SalesAgg", + mode: "Import", + mExpression: "..." + }] +) +``` + +## Performance Testing + +### Use Performance Analyzer +1. Enable in Power BI Desktop +2. Start recording +3. Interact with visuals +4. Review DAX query times + +### Monitor with DAX Studio +External tool for: +- Query timing +- Server timings +- Query plans + +## Validation Checklist + +- [ ] Unnecessary columns removed +- [ ] Appropriate data types used +- [ ] High-cardinality columns addressed +- [ ] Bidirectional relationships minimized +- [ ] DAX uses variables for repeated expressions +- [ ] No FILTER on entire tables +- [ ] DIVIDE used instead of division operator +- [ ] Auto date/time disabled for DirectQuery +- [ ] Performance tested with representative data diff --git a/.github/skills/powerbi-modeling/references/RELATIONSHIPS.md b/.github/skills/powerbi-modeling/references/RELATIONSHIPS.md new file mode 100644 index 0000000..a902b53 --- /dev/null +++ b/.github/skills/powerbi-modeling/references/RELATIONSHIPS.md @@ -0,0 +1,147 @@ +# Relationships in Power BI + +## Relationship Properties + +### Cardinality +| Type | Use Case | Notes | +|------|----------|-------| +| One-to-Many (*:1) | Dimension to Fact | Most common, preferred | +| Many-to-One (1:*) | Fact to Dimension | Same as above, direction reversed | +| One-to-One (1:1) | Dimension extensions | Use sparingly | +| Many-to-Many (*:*) | Bridge tables, complex scenarios | Requires careful design | + +### Cross-Filter Direction +| Setting | Behavior | When to Use | +|---------|----------|-------------| +| Single | Filters flow from "one" to "many" | Default, best performance | +| Both | Filters flow in both directions | Only when necessary | + +## Best Practices + +### 1. Prefer One-to-Many Relationships +``` +Customer (1) --> (*) Sales +Product (1) --> (*) Sales +Date (1) --> (*) Sales +``` + +### 2. Use Single-Direction Cross-Filtering +Bidirectional filtering: +- Impacts performance negatively +- Can create ambiguous filter paths +- May produce unexpected results + +**Only use bidirectional when:** +- Dimension-to-dimension analysis through fact table +- Specific RLS requirements + +**Better alternative:** Use CROSSFILTER in DAX measures: +```dax +Countries Sold = +CALCULATE( + DISTINCTCOUNT(Customer[Country]), + CROSSFILTER(Customer[CustomerKey], Sales[CustomerKey], BOTH) +) +``` + +### 3. One Active Path Between Tables +- Only one active relationship between any two tables +- Use USERELATIONSHIP for role-playing dimensions: + +```dax +Sales by Ship Date = +CALCULATE( + [Total Sales], + USERELATIONSHIP(Sales[ShipDate], Date[Date]) +) +``` + +### 4. Avoid Ambiguous Paths +Circular references cause errors. Solutions: +- Deactivate one relationship +- Restructure model +- Use USERELATIONSHIP in measures + +## Relationship Patterns + +### Standard Star Schema +``` + [Date] + | +[Product]--[Sales]--[Customer] + | + [Store] +``` + +### Role-Playing Dimension +``` +[Date] --(active)-- [Sales.OrderDate] + | + +--(inactive)-- [Sales.ShipDate] +``` + +### Bridge Table (Many-to-Many) +``` +[Customer]--(*)--[CustomerAccount]--(*)--[Account] +``` + +### Factless Fact Table +``` +[Product]--[ProductPromotion]--[Promotion] +``` +Used to capture relationships without measures. + +## Creating Relationships via MCP + +### List Current Relationships +``` +relationship_operations(operation: "List") +``` + +### Create New Relationship +``` +relationship_operations( + operation: "Create", + definitions: [{ + fromTable: "Sales", + fromColumn: "ProductKey", + toTable: "Product", + toColumn: "ProductKey", + crossFilteringBehavior: "OneDirection", + isActive: true + }] +) +``` + +### Deactivate Relationship +``` +relationship_operations( + operation: "Deactivate", + references: [{ name: "relationship-guid-here" }] +) +``` + +## Troubleshooting + +### "Ambiguous Path" Error +Multiple active paths exist between tables. +- Check for: Multiple fact tables sharing dimensions +- Solution: Deactivate redundant relationships + +### Bidirectional Not Allowed +Circular reference would be created. +- Solution: Restructure or use DAX CROSSFILTER + +### Relationship Not Detected +Columns may have different data types. +- Ensure both columns have identical types +- Check for trailing spaces in text keys + +## Validation Checklist + +- [ ] All relationships are one-to-many where possible +- [ ] Cross-filter is single direction by default +- [ ] Only one active path between any two tables +- [ ] Role-playing dimensions use inactive relationships +- [ ] No circular reference paths +- [ ] Key columns have matching data types diff --git a/.github/skills/powerbi-modeling/references/RLS.md b/.github/skills/powerbi-modeling/references/RLS.md new file mode 100644 index 0000000..b862e12 --- /dev/null +++ b/.github/skills/powerbi-modeling/references/RLS.md @@ -0,0 +1,226 @@ +# Row-Level Security (RLS) in Power BI + +## Overview + +Row-Level Security restricts data access at the row level based on user identity. Users see only the data they're authorized to view. + +## Design Principles + +### 1. Filter on Dimension Tables +Apply RLS to dimensions, not fact tables: +- More efficient (smaller tables) +- Filters propagate through relationships +- Easier to maintain + +```dax +// On Customer dimension - filters propagate to Sales +[Region] = "West" +``` + +### 2. Create Minimal Roles +Avoid many role combinations: +- Each role = separate cache +- Roles are additive (union, not intersection) +- Consolidate where possible + +### 3. Use Dynamic RLS When Possible +Data-driven rules scale better: +- User mapping in a table +- USERPRINCIPALNAME() for identity +- No role changes when users change + +## Static vs Dynamic RLS + +### Static RLS +Fixed rules per role: +```dax +// Role: West Region +[Region] = "West" + +// Role: East Region +[Region] = "East" +``` + +**Pros:** Simple, clear +**Cons:** Doesn't scale, requires role per group + +### Dynamic RLS +User identity drives filtering: +```dax +// Single role filters based on logged-in user +[ManagerEmail] = USERPRINCIPALNAME() +``` + +**Pros:** Scales, self-maintaining +**Cons:** Requires user mapping data + +## Implementation Patterns + +### Pattern 1: Direct User Mapping +User email in dimension table: +```dax +// On Customer table +[CustomerEmail] = USERPRINCIPALNAME() +``` + +### Pattern 2: Security Table +Separate table mapping users to data: +``` +SecurityMapping table: +| UserEmail | Region | +|-----------|--------| +| joe@co.com | West | +| sue@co.com | East | +``` + +```dax +// On Region dimension +[Region] IN + SELECTCOLUMNS( + FILTER(SecurityMapping, [UserEmail] = USERPRINCIPALNAME()), + "Region", [Region] + ) +``` + +### Pattern 3: Manager Hierarchy +Users see their data plus subordinates: +```dax +// Using PATH functions for hierarchy +PATHCONTAINS(Employee[ManagerPath], + LOOKUPVALUE(Employee[EmployeeID], Employee[Email], USERPRINCIPALNAME())) +``` + +### Pattern 4: Multiple Rules +Combine conditions: +```dax +// Users see their region OR if they're a global viewer +[Region] = LOOKUPVALUE(Users[Region], Users[Email], USERPRINCIPALNAME()) +|| LOOKUPVALUE(Users[IsGlobal], Users[Email], USERPRINCIPALNAME()) = TRUE() +``` + +## Creating Roles via MCP + +### List Existing Roles +``` +security_role_operations(operation: "List") +``` + +### Create Role with Permission +``` +security_role_operations( + operation: "Create", + definitions: [{ + name: "Regional Sales", + modelPermission: "Read", + description: "Restricts sales data by region" + }] +) +``` + +### Add Table Permission (Filter) +``` +security_role_operations( + operation: "CreatePermissions", + permissionDefinitions: [{ + roleName: "Regional Sales", + tableName: "Customer", + filterExpression: "[Region] = USERPRINCIPALNAME()" + }] +) +``` + +### Get Effective Permissions +``` +security_role_operations( + operation: "GetEffectivePermissions", + references: [{ name: "Regional Sales" }] +) +``` + +## Testing RLS + +### In Power BI Desktop +1. Modeling tab > View As +2. Select role(s) to test +3. Optionally specify user identity +4. Verify data filtering + +### Test Unexpected Values +For dynamic RLS, test: +- Valid users +- Unknown users (should see nothing or error gracefully) +- NULL/blank values + +```dax +// Defensive pattern - returns no data for unknown users +IF( + USERPRINCIPALNAME() IN VALUES(SecurityMapping[UserEmail]), + [Region] IN SELECTCOLUMNS(...), + FALSE() +) +``` + +## Common Mistakes + +### 1. RLS on Fact Tables Only +**Problem:** Large table scans, poor performance +**Solution:** Apply to dimension tables, let relationships propagate + +### 2. Using LOOKUPVALUE Instead of Relationships +**Problem:** Expensive, doesn't scale +**Solution:** Create proper relationships, let filters flow + +### 3. Expecting Intersection Behavior +**Problem:** Multiple roles = UNION (additive), not intersection +**Solution:** Design roles with union behavior in mind + +### 4. Forgetting About DirectQuery +**Problem:** RLS filters become WHERE clauses +**Solution:** Ensure source database can handle the query patterns + +### 5. Not Testing Edge Cases +**Problem:** Users see unexpected data +**Solution:** Test with: valid users, invalid users, multiple roles + +## Bidirectional RLS + +For bidirectional relationships with RLS: +``` +Enable "Apply security filter in both directions" +``` + +Only use when: +- RLS requires filtering through many-to-many +- Dimension-to-dimension security needed + +**Caution:** Only one bidirectional relationship per path allowed. + +## Performance Considerations + +- RLS adds WHERE clauses to every query +- Complex DAX in filters hurts performance +- Test with realistic user counts +- Consider aggregations for large models + +## Object-Level Security (OLS) + +Restrict access to entire tables or columns: +``` +// Via XMLA/TMSL - not available in Desktop UI +``` + +Use for: +- Hiding sensitive columns (salary, SSN) +- Restricting entire tables +- Combined with RLS for comprehensive security + +## Validation Checklist + +- [ ] RLS applied to dimension tables (not fact tables) +- [ ] Filters propagate correctly through relationships +- [ ] Dynamic RLS uses USERPRINCIPALNAME() +- [ ] Tested with valid and invalid users +- [ ] Edge cases handled (NULL, unknown users) +- [ ] Performance tested under load +- [ ] Role mappings documented +- [ ] Workspace roles understood (Admins bypass RLS) diff --git a/.github/skills/powerbi-modeling/references/STAR-SCHEMA.md b/.github/skills/powerbi-modeling/references/STAR-SCHEMA.md new file mode 100644 index 0000000..e5a744a --- /dev/null +++ b/.github/skills/powerbi-modeling/references/STAR-SCHEMA.md @@ -0,0 +1,103 @@ +# Star Schema Design for Power BI + +## Overview + +Star schema is the optimal design pattern for Power BI semantic models. It organizes data into: +- **Dimension tables**: Enable filtering and grouping (the "one" side) +- **Fact tables**: Enable summarization (the "many" side) + +## Table Classification + +### Dimension Tables +- Contain descriptive attributes for filtering/slicing +- Have unique key columns (one row per entity) +- Examples: Customer, Product, Date, Geography, Employee +- Naming convention: Singular noun (`Customer`, `Product`) + +### Fact Tables +- Contain measurable, quantitative data +- Have foreign keys to dimensions +- Store data at consistent grain (one row per transaction/event) +- Examples: Sales, Orders, Inventory, WebVisits +- Naming convention: Business process noun (`Sales`, `Orders`) + +## Design Principles + +### 1. Separate Dimensions from Facts +``` +BAD: Single denormalized "Sales" table with customer details +GOOD: "Sales" fact table + "Customer" dimension table +``` + +### 2. Consistent Grain +Every row in a fact table represents the same thing: +- Order line level (most common) +- Daily aggregation +- Monthly summary + +Never mix grains in one table. + +### 3. Surrogate Keys +Add surrogate keys when source lacks unique identifiers: +```m +// Power Query: Add index column += Table.AddIndexColumn(Source, "CustomerKey", 1, 1) +``` + +### 4. Date Dimension +Always create a dedicated date table: +- Mark as date table in Power BI +- Include fiscal periods if needed +- Add relative date columns (IsCurrentMonth, IsPreviousYear) + +```dax +Date = +ADDCOLUMNS( + CALENDAR(DATE(2020,1,1), DATE(2030,12,31)), + "Year", YEAR([Date]), + "Month", FORMAT([Date], "MMMM"), + "MonthNum", MONTH([Date]), + "Quarter", "Q" & FORMAT([Date], "Q"), + "WeekDay", FORMAT([Date], "dddd") +) +``` + +## Special Dimension Types + +### Role-Playing Dimensions +Same dimension used multiple times (e.g., Date for OrderDate, ShipDate): +- Option 1: Duplicate the table (OrderDate, ShipDate tables) +- Option 2: Use inactive relationships with USERELATIONSHIP in DAX + +### Slowly Changing Dimensions (Type 2) +Track historical changes with version columns: +- StartDate, EndDate columns +- IsCurrent flag +- Requires pre-processing in data warehouse + +### Junk Dimensions +Combine low-cardinality flags into one table: +``` +OrderFlags dimension: IsRush, IsGift, IsOnline +``` + +### Degenerate Dimensions +Keep transaction identifiers (OrderNumber, InvoiceID) in fact table. + +## Anti-Patterns to Avoid + +| Anti-Pattern | Problem | Solution | +|--------------|---------|----------| +| Wide denormalized tables | Poor performance, hard to maintain | Split into star schema | +| Snowflake (normalized dims) | Extra joins hurt performance | Flatten dimensions | +| Many-to-many without bridge | Ambiguous results | Add bridge/junction table | +| Mixed grain facts | Incorrect aggregations | Separate tables per grain | + +## Validation Checklist + +- [ ] Each table is clearly dimension or fact +- [ ] Fact tables have foreign keys to all related dimensions +- [ ] Dimensions have unique key columns +- [ ] Date table exists and is marked +- [ ] No circular relationship paths +- [ ] Consistent naming conventions diff --git a/.github/skills/pptx/LICENSE.txt b/.github/skills/pptx/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/.github/skills/pptx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/.github/skills/pptx/SKILL.md b/.github/skills/pptx/SKILL.md new file mode 100644 index 0000000..df5000e --- /dev/null +++ b/.github/skills/pptx/SKILL.md @@ -0,0 +1,232 @@ +--- +name: pptx +description: "Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in an email or summary); editing, modifying, or updating existing presentations; combining or splitting slide files; working with templates, layouts, speaker notes, or comments. Trigger whenever the user mentions \"deck,\" \"slides,\" \"presentation,\" or references a .pptx filename, regardless of what they plan to do with the content afterward. If a .pptx file needs to be opened, created, or touched, use this skill." +license: Proprietary. LICENSE.txt has complete terms +--- + +# PPTX Skill + +## Quick Reference + +| Task | Guide | +|------|-------| +| Read/analyze content | `python -m markitdown presentation.pptx` | +| Edit or create from template | Read [editing.md](editing.md) | +| Create from scratch | Read [pptxgenjs.md](pptxgenjs.md) | + +--- + +## Reading Content + +```bash +# Text extraction +python -m markitdown presentation.pptx + +# Visual overview +python scripts/thumbnail.py presentation.pptx + +# Raw XML +python scripts/office/unpack.py presentation.pptx unpacked/ +``` + +--- + +## Editing Workflow + +**Read [editing.md](editing.md) for full details.** + +1. Analyze template with `thumbnail.py` +2. Unpack → manipulate slides → edit content → clean → pack + +--- + +## Creating from Scratch + +**Read [pptxgenjs.md](pptxgenjs.md) for full details.** + +Use when no template or reference presentation is available. + +--- + +## Design Ideas + +**Don't create boring slides.** Plain bullets on a white background won't impress anyone. Consider ideas from this list for each slide. + +### Before Starting + +- **Pick a bold, content-informed color palette**: The palette should feel designed for THIS topic. If swapping your colors into a completely different presentation would still "work," you haven't made specific enough choices. +- **Dominance over equality**: One color should dominate (60-70% visual weight), with 1-2 supporting tones and one sharp accent. Never give all colors equal weight. +- **Dark/light contrast**: Dark backgrounds for title + conclusion slides, light for content ("sandwich" structure). Or commit to dark throughout for a premium feel. +- **Commit to a visual motif**: Pick ONE distinctive element and repeat it — rounded image frames, icons in colored circles, thick single-side borders. Carry it across every slide. + +### Color Palettes + +Choose colors that match your topic — don't default to generic blue. Use these palettes as inspiration: + +| Theme | Primary | Secondary | Accent | +|-------|---------|-----------|--------| +| **Midnight Executive** | `1E2761` (navy) | `CADCFC` (ice blue) | `FFFFFF` (white) | +| **Forest & Moss** | `2C5F2D` (forest) | `97BC62` (moss) | `F5F5F5` (cream) | +| **Coral Energy** | `F96167` (coral) | `F9E795` (gold) | `2F3C7E` (navy) | +| **Warm Terracotta** | `B85042` (terracotta) | `E7E8D1` (sand) | `A7BEAE` (sage) | +| **Ocean Gradient** | `065A82` (deep blue) | `1C7293` (teal) | `21295C` (midnight) | +| **Charcoal Minimal** | `36454F` (charcoal) | `F2F2F2` (off-white) | `212121` (black) | +| **Teal Trust** | `028090` (teal) | `00A896` (seafoam) | `02C39A` (mint) | +| **Berry & Cream** | `6D2E46` (berry) | `A26769` (dusty rose) | `ECE2D0` (cream) | +| **Sage Calm** | `84B59F` (sage) | `69A297` (eucalyptus) | `50808E` (slate) | +| **Cherry Bold** | `990011` (cherry) | `FCF6F5` (off-white) | `2F3C7E` (navy) | + +### For Each Slide + +**Every slide needs a visual element** — image, chart, icon, or shape. Text-only slides are forgettable. + +**Layout options:** +- Two-column (text left, illustration on right) +- Icon + text rows (icon in colored circle, bold header, description below) +- 2x2 or 2x3 grid (image on one side, grid of content blocks on other) +- Half-bleed image (full left or right side) with content overlay + +**Data display:** +- Large stat callouts (big numbers 60-72pt with small labels below) +- Comparison columns (before/after, pros/cons, side-by-side options) +- Timeline or process flow (numbered steps, arrows) + +**Visual polish:** +- Icons in small colored circles next to section headers +- Italic accent text for key stats or taglines + +### Typography + +**Choose an interesting font pairing** — don't default to Arial. Pick a header font with personality and pair it with a clean body font. + +| Header Font | Body Font | +|-------------|-----------| +| Georgia | Calibri | +| Arial Black | Arial | +| Calibri | Calibri Light | +| Cambria | Calibri | +| Trebuchet MS | Calibri | +| Impact | Arial | +| Palatino | Garamond | +| Consolas | Calibri | + +| Element | Size | +|---------|------| +| Slide title | 36-44pt bold | +| Section header | 20-24pt bold | +| Body text | 14-16pt | +| Captions | 10-12pt muted | + +### Spacing + +- 0.5" minimum margins +- 0.3-0.5" between content blocks +- Leave breathing room—don't fill every inch + +### Avoid (Common Mistakes) + +- **Don't repeat the same layout** — vary columns, cards, and callouts across slides +- **Don't center body text** — left-align paragraphs and lists; center only titles +- **Don't skimp on size contrast** — titles need 36pt+ to stand out from 14-16pt body +- **Don't default to blue** — pick colors that reflect the specific topic +- **Don't mix spacing randomly** — choose 0.3" or 0.5" gaps and use consistently +- **Don't style one slide and leave the rest plain** — commit fully or keep it simple throughout +- **Don't create text-only slides** — add images, icons, charts, or visual elements; avoid plain title + bullets +- **Don't forget text box padding** — when aligning lines or shapes with text edges, set `margin: 0` on the text box or offset the shape to account for padding +- **Don't use low-contrast elements** — icons AND text need strong contrast against the background; avoid light text on light backgrounds or dark text on dark backgrounds +- **NEVER use accent lines under titles** — these are a hallmark of AI-generated slides; use whitespace or background color instead + +--- + +## QA (Required) + +**Assume there are problems. Your job is to find them.** + +Your first render is almost never correct. Approach QA as a bug hunt, not a confirmation step. If you found zero issues on first inspection, you weren't looking hard enough. + +### Content QA + +```bash +python -m markitdown output.pptx +``` + +Check for missing content, typos, wrong order. + +**When using templates, check for leftover placeholder text:** + +```bash +python -m markitdown output.pptx | grep -iE "xxxx|lorem|ipsum|this.*(page|slide).*layout" +``` + +If grep returns results, fix them before declaring success. + +### Visual QA + +**⚠️ USE SUBAGENTS** — even for 2-3 slides. You've been staring at the code and will see what you expect, not what's there. Subagents have fresh eyes. + +Convert slides to images (see [Converting to Images](#converting-to-images)), then use this prompt: + +``` +Visually inspect these slides. Assume there are issues — find them. + +Look for: +- Overlapping elements (text through shapes, lines through words, stacked elements) +- Text overflow or cut off at edges/box boundaries +- Decorative lines positioned for single-line text but title wrapped to two lines +- Source citations or footers colliding with content above +- Elements too close (< 0.3" gaps) or cards/sections nearly touching +- Uneven gaps (large empty area in one place, cramped in another) +- Insufficient margin from slide edges (< 0.5") +- Columns or similar elements not aligned consistently +- Low-contrast text (e.g., light gray text on cream-colored background) +- Low-contrast icons (e.g., dark icons on dark backgrounds without a contrasting circle) +- Text boxes too narrow causing excessive wrapping +- Leftover placeholder content + +For each slide, list issues or areas of concern, even if minor. + +Read and analyze these images: +1. /path/to/slide-01.jpg (Expected: [brief description]) +2. /path/to/slide-02.jpg (Expected: [brief description]) + +Report ALL issues found, including minor ones. +``` + +### Verification Loop + +1. Generate slides → Convert to images → Inspect +2. **List issues found** (if none found, look again more critically) +3. Fix issues +4. **Re-verify affected slides** — one fix often creates another problem +5. Repeat until a full pass reveals no new issues + +**Do not declare success until you've completed at least one fix-and-verify cycle.** + +--- + +## Converting to Images + +Convert presentations to individual slide images for visual inspection: + +```bash +python scripts/office/soffice.py --headless --convert-to pdf output.pptx +pdftoppm -jpeg -r 150 output.pdf slide +``` + +This creates `slide-01.jpg`, `slide-02.jpg`, etc. + +To re-render specific slides after fixes: + +```bash +pdftoppm -jpeg -r 150 -f N -l N output.pdf slide-fixed +``` + +--- + +## Dependencies + +- `pip install "markitdown[pptx]"` - text extraction +- `pip install Pillow` - thumbnail grids +- `npm install -g pptxgenjs` - creating from scratch +- LibreOffice (`soffice`) - PDF conversion (auto-configured for sandboxed environments via `scripts/office/soffice.py`) +- Poppler (`pdftoppm`) - PDF to images diff --git a/.github/skills/pptx/editing.md b/.github/skills/pptx/editing.md new file mode 100644 index 0000000..f873e8a --- /dev/null +++ b/.github/skills/pptx/editing.md @@ -0,0 +1,205 @@ +# Editing Presentations + +## Template-Based Workflow + +When using an existing presentation as a template: + +1. **Analyze existing slides**: + ```bash + python scripts/thumbnail.py template.pptx + python -m markitdown template.pptx + ``` + Review `thumbnails.jpg` to see layouts, and markitdown output to see placeholder text. + +2. **Plan slide mapping**: For each content section, choose a template slide. + + ⚠️ **USE VARIED LAYOUTS** — monotonous presentations are a common failure mode. Don't default to basic title + bullet slides. Actively seek out: + - Multi-column layouts (2-column, 3-column) + - Image + text combinations + - Full-bleed images with text overlay + - Quote or callout slides + - Section dividers + - Stat/number callouts + - Icon grids or icon + text rows + + **Avoid:** Repeating the same text-heavy layout for every slide. + + Match content type to layout style (e.g., key points → bullet slide, team info → multi-column, testimonials → quote slide). + +3. **Unpack**: `python scripts/office/unpack.py template.pptx unpacked/` + +4. **Build presentation** (do this yourself, not with subagents): + - Delete unwanted slides (remove from ``) + - Duplicate slides you want to reuse (`add_slide.py`) + - Reorder slides in `` + - **Complete all structural changes before step 5** + +5. **Edit content**: Update text in each `slide{N}.xml`. + **Use subagents here if available** — slides are separate XML files, so subagents can edit in parallel. + +6. **Clean**: `python scripts/clean.py unpacked/` + +7. **Pack**: `python scripts/office/pack.py unpacked/ output.pptx --original template.pptx` + +--- + +## Scripts + +| Script | Purpose | +|--------|---------| +| `unpack.py` | Extract and pretty-print PPTX | +| `add_slide.py` | Duplicate slide or create from layout | +| `clean.py` | Remove orphaned files | +| `pack.py` | Repack with validation | +| `thumbnail.py` | Create visual grid of slides | + +### unpack.py + +```bash +python scripts/office/unpack.py input.pptx unpacked/ +``` + +Extracts PPTX, pretty-prints XML, escapes smart quotes. + +### add_slide.py + +```bash +python scripts/add_slide.py unpacked/ slide2.xml # Duplicate slide +python scripts/add_slide.py unpacked/ slideLayout2.xml # From layout +``` + +Prints `` to add to `` at desired position. + +### clean.py + +```bash +python scripts/clean.py unpacked/ +``` + +Removes slides not in ``, unreferenced media, orphaned rels. + +### pack.py + +```bash +python scripts/office/pack.py unpacked/ output.pptx --original input.pptx +``` + +Validates, repairs, condenses XML, re-encodes smart quotes. + +### thumbnail.py + +```bash +python scripts/thumbnail.py input.pptx [output_prefix] [--cols N] +``` + +Creates `thumbnails.jpg` with slide filenames as labels. Default 3 columns, max 12 per grid. + +**Use for template analysis only** (choosing layouts). For visual QA, use `soffice` + `pdftoppm` to create full-resolution individual slide images—see SKILL.md. + +--- + +## Slide Operations + +Slide order is in `ppt/presentation.xml` → ``. + +**Reorder**: Rearrange `` elements. + +**Delete**: Remove ``, then run `clean.py`. + +**Add**: Use `add_slide.py`. Never manually copy slide files—the script handles notes references, Content_Types.xml, and relationship IDs that manual copying misses. + +--- + +## Editing Content + +**Subagents:** If available, use them here (after completing step 4). Each slide is a separate XML file, so subagents can edit in parallel. In your prompt to subagents, include: +- The slide file path(s) to edit +- **"Use the Edit tool for all changes"** +- The formatting rules and common pitfalls below + +For each slide: +1. Read the slide's XML +2. Identify ALL placeholder content—text, images, charts, icons, captions +3. Replace each placeholder with final content + +**Use the Edit tool, not sed or Python scripts.** The Edit tool forces specificity about what to replace and where, yielding better reliability. + +### Formatting Rules + +- **Bold all headers, subheadings, and inline labels**: Use `b="1"` on ``. This includes: + - Slide titles + - Section headers within a slide + - Inline labels like (e.g.: "Status:", "Description:") at the start of a line +- **Never use unicode bullets (•)**: Use proper list formatting with `` or `` +- **Bullet consistency**: Let bullets inherit from the layout. Only specify `` or ``. + +--- + +## Common Pitfalls + +### Template Adaptation + +When source content has fewer items than the template: +- **Remove excess elements entirely** (images, shapes, text boxes), don't just clear text +- Check for orphaned visuals after clearing text content +- Run visual QA to catch mismatched counts + +When replacing text with different length content: +- **Shorter replacements**: Usually safe +- **Longer replacements**: May overflow or wrap unexpectedly +- Test with visual QA after text changes +- Consider truncating or splitting content to fit the template's design constraints + +**Template slots ≠ Source items**: If template has 4 team members but source has 3 users, delete the 4th member's entire group (image + text boxes), not just the text. + +### Multi-Item Content + +If source has multiple items (numbered lists, multiple sections), create separate `` elements for each — **never concatenate into one string**. + +**❌ WRONG** — all items in one paragraph: +```xml + + Step 1: Do the first thing. Step 2: Do the second thing. + +``` + +**✅ CORRECT** — separate paragraphs with bold headers: +```xml + + + Step 1 + + + + Do the first thing. + + + + Step 2 + + +``` + +Copy `` from the original paragraph to preserve line spacing. Use `b="1"` on headers. + +### Smart Quotes + +Handled automatically by unpack/pack. But the Edit tool converts smart quotes to ASCII. + +**When adding new text with quotes, use XML entities:** + +```xml +the “Agreement” +``` + +| Character | Name | Unicode | XML Entity | +|-----------|------|---------|------------| +| `“` | Left double quote | U+201C | `“` | +| `”` | Right double quote | U+201D | `”` | +| `‘` | Left single quote | U+2018 | `‘` | +| `’` | Right single quote | U+2019 | `’` | + +### Other + +- **Whitespace**: Use `xml:space="preserve"` on `` with leading/trailing spaces +- **XML parsing**: Use `defusedxml.minidom`, not `xml.etree.ElementTree` (corrupts namespaces) diff --git a/.github/skills/pptx/pptxgenjs.md b/.github/skills/pptx/pptxgenjs.md new file mode 100644 index 0000000..6bfed90 --- /dev/null +++ b/.github/skills/pptx/pptxgenjs.md @@ -0,0 +1,420 @@ +# PptxGenJS Tutorial + +## Setup & Basic Structure + +```javascript +const pptxgen = require("pptxgenjs"); + +let pres = new pptxgen(); +pres.layout = 'LAYOUT_16x9'; // or 'LAYOUT_16x10', 'LAYOUT_4x3', 'LAYOUT_WIDE' +pres.author = 'Your Name'; +pres.title = 'Presentation Title'; + +let slide = pres.addSlide(); +slide.addText("Hello World!", { x: 0.5, y: 0.5, fontSize: 36, color: "363636" }); + +pres.writeFile({ fileName: "Presentation.pptx" }); +``` + +## Layout Dimensions + +Slide dimensions (coordinates in inches): +- `LAYOUT_16x9`: 10" × 5.625" (default) +- `LAYOUT_16x10`: 10" × 6.25" +- `LAYOUT_4x3`: 10" × 7.5" +- `LAYOUT_WIDE`: 13.3" × 7.5" + +--- + +## Text & Formatting + +```javascript +// Basic text +slide.addText("Simple Text", { + x: 1, y: 1, w: 8, h: 2, fontSize: 24, fontFace: "Arial", + color: "363636", bold: true, align: "center", valign: "middle" +}); + +// Character spacing (use charSpacing, not letterSpacing which is silently ignored) +slide.addText("SPACED TEXT", { x: 1, y: 1, w: 8, h: 1, charSpacing: 6 }); + +// Rich text arrays +slide.addText([ + { text: "Bold ", options: { bold: true } }, + { text: "Italic ", options: { italic: true } } +], { x: 1, y: 3, w: 8, h: 1 }); + +// Multi-line text (requires breakLine: true) +slide.addText([ + { text: "Line 1", options: { breakLine: true } }, + { text: "Line 2", options: { breakLine: true } }, + { text: "Line 3" } // Last item doesn't need breakLine +], { x: 0.5, y: 0.5, w: 8, h: 2 }); + +// Text box margin (internal padding) +slide.addText("Title", { + x: 0.5, y: 0.3, w: 9, h: 0.6, + margin: 0 // Use 0 when aligning text with other elements like shapes or icons +}); +``` + +**Tip:** Text boxes have internal margin by default. Set `margin: 0` when you need text to align precisely with shapes, lines, or icons at the same x-position. + +--- + +## Lists & Bullets + +```javascript +// ✅ CORRECT: Multiple bullets +slide.addText([ + { text: "First item", options: { bullet: true, breakLine: true } }, + { text: "Second item", options: { bullet: true, breakLine: true } }, + { text: "Third item", options: { bullet: true } } +], { x: 0.5, y: 0.5, w: 8, h: 3 }); + +// ❌ WRONG: Never use unicode bullets +slide.addText("• First item", { ... }); // Creates double bullets + +// Sub-items and numbered lists +{ text: "Sub-item", options: { bullet: true, indentLevel: 1 } } +{ text: "First", options: { bullet: { type: "number" }, breakLine: true } } +``` + +--- + +## Shapes + +```javascript +slide.addShape(pres.shapes.RECTANGLE, { + x: 0.5, y: 0.8, w: 1.5, h: 3.0, + fill: { color: "FF0000" }, line: { color: "000000", width: 2 } +}); + +slide.addShape(pres.shapes.OVAL, { x: 4, y: 1, w: 2, h: 2, fill: { color: "0000FF" } }); + +slide.addShape(pres.shapes.LINE, { + x: 1, y: 3, w: 5, h: 0, line: { color: "FF0000", width: 3, dashType: "dash" } +}); + +// With transparency +slide.addShape(pres.shapes.RECTANGLE, { + x: 1, y: 1, w: 3, h: 2, + fill: { color: "0088CC", transparency: 50 } +}); + +// Rounded rectangle (rectRadius only works with ROUNDED_RECTANGLE, not RECTANGLE) +// ⚠️ Don't pair with rectangular accent overlays — they won't cover rounded corners. Use RECTANGLE instead. +slide.addShape(pres.shapes.ROUNDED_RECTANGLE, { + x: 1, y: 1, w: 3, h: 2, + fill: { color: "FFFFFF" }, rectRadius: 0.1 +}); + +// With shadow +slide.addShape(pres.shapes.RECTANGLE, { + x: 1, y: 1, w: 3, h: 2, + fill: { color: "FFFFFF" }, + shadow: { type: "outer", color: "000000", blur: 6, offset: 2, angle: 135, opacity: 0.15 } +}); +``` + +Shadow options: + +| Property | Type | Range | Notes | +|----------|------|-------|-------| +| `type` | string | `"outer"`, `"inner"` | | +| `color` | string | 6-char hex (e.g. `"000000"`) | No `#` prefix, no 8-char hex — see Common Pitfalls | +| `blur` | number | 0-100 pt | | +| `offset` | number | 0-200 pt | **Must be non-negative** — negative values corrupt the file | +| `angle` | number | 0-359 degrees | Direction the shadow falls (135 = bottom-right, 270 = upward) | +| `opacity` | number | 0.0-1.0 | Use this for transparency, never encode in color string | + +To cast a shadow upward (e.g. on a footer bar), use `angle: 270` with a positive offset — do **not** use a negative offset. + +**Note**: Gradient fills are not natively supported. Use a gradient image as a background instead. + +--- + +## Images + +### Image Sources + +```javascript +// From file path +slide.addImage({ path: "images/chart.png", x: 1, y: 1, w: 5, h: 3 }); + +// From URL +slide.addImage({ path: "https://example.com/image.jpg", x: 1, y: 1, w: 5, h: 3 }); + +// From base64 (faster, no file I/O) +slide.addImage({ data: "image/png;base64,iVBORw0KGgo...", x: 1, y: 1, w: 5, h: 3 }); +``` + +### Image Options + +```javascript +slide.addImage({ + path: "image.png", + x: 1, y: 1, w: 5, h: 3, + rotate: 45, // 0-359 degrees + rounding: true, // Circular crop + transparency: 50, // 0-100 + flipH: true, // Horizontal flip + flipV: false, // Vertical flip + altText: "Description", // Accessibility + hyperlink: { url: "https://example.com" } +}); +``` + +### Image Sizing Modes + +```javascript +// Contain - fit inside, preserve ratio +{ sizing: { type: 'contain', w: 4, h: 3 } } + +// Cover - fill area, preserve ratio (may crop) +{ sizing: { type: 'cover', w: 4, h: 3 } } + +// Crop - cut specific portion +{ sizing: { type: 'crop', x: 0.5, y: 0.5, w: 2, h: 2 } } +``` + +### Calculate Dimensions (preserve aspect ratio) + +```javascript +const origWidth = 1978, origHeight = 923, maxHeight = 3.0; +const calcWidth = maxHeight * (origWidth / origHeight); +const centerX = (10 - calcWidth) / 2; + +slide.addImage({ path: "image.png", x: centerX, y: 1.2, w: calcWidth, h: maxHeight }); +``` + +### Supported Formats + +- **Standard**: PNG, JPG, GIF (animated GIFs work in Microsoft 365) +- **SVG**: Works in modern PowerPoint/Microsoft 365 + +--- + +## Icons + +Use react-icons to generate SVG icons, then rasterize to PNG for universal compatibility. + +### Setup + +```javascript +const React = require("react"); +const ReactDOMServer = require("react-dom/server"); +const sharp = require("sharp"); +const { FaCheckCircle, FaChartLine } = require("react-icons/fa"); + +function renderIconSvg(IconComponent, color = "#000000", size = 256) { + return ReactDOMServer.renderToStaticMarkup( + React.createElement(IconComponent, { color, size: String(size) }) + ); +} + +async function iconToBase64Png(IconComponent, color, size = 256) { + const svg = renderIconSvg(IconComponent, color, size); + const pngBuffer = await sharp(Buffer.from(svg)).png().toBuffer(); + return "image/png;base64," + pngBuffer.toString("base64"); +} +``` + +### Add Icon to Slide + +```javascript +const iconData = await iconToBase64Png(FaCheckCircle, "#4472C4", 256); + +slide.addImage({ + data: iconData, + x: 1, y: 1, w: 0.5, h: 0.5 // Size in inches +}); +``` + +**Note**: Use size 256 or higher for crisp icons. The size parameter controls the rasterization resolution, not the display size on the slide (which is set by `w` and `h` in inches). + +### Icon Libraries + +Install: `npm install -g react-icons react react-dom sharp` + +Popular icon sets in react-icons: +- `react-icons/fa` - Font Awesome +- `react-icons/md` - Material Design +- `react-icons/hi` - Heroicons +- `react-icons/bi` - Bootstrap Icons + +--- + +## Slide Backgrounds + +```javascript +// Solid color +slide.background = { color: "F1F1F1" }; + +// Color with transparency +slide.background = { color: "FF3399", transparency: 50 }; + +// Image from URL +slide.background = { path: "https://example.com/bg.jpg" }; + +// Image from base64 +slide.background = { data: "image/png;base64,iVBORw0KGgo..." }; +``` + +--- + +## Tables + +```javascript +slide.addTable([ + ["Header 1", "Header 2"], + ["Cell 1", "Cell 2"] +], { + x: 1, y: 1, w: 8, h: 2, + border: { pt: 1, color: "999999" }, fill: { color: "F1F1F1" } +}); + +// Advanced with merged cells +let tableData = [ + [{ text: "Header", options: { fill: { color: "6699CC" }, color: "FFFFFF", bold: true } }, "Cell"], + [{ text: "Merged", options: { colspan: 2 } }] +]; +slide.addTable(tableData, { x: 1, y: 3.5, w: 8, colW: [4, 4] }); +``` + +--- + +## Charts + +```javascript +// Bar chart +slide.addChart(pres.charts.BAR, [{ + name: "Sales", labels: ["Q1", "Q2", "Q3", "Q4"], values: [4500, 5500, 6200, 7100] +}], { + x: 0.5, y: 0.6, w: 6, h: 3, barDir: 'col', + showTitle: true, title: 'Quarterly Sales' +}); + +// Line chart +slide.addChart(pres.charts.LINE, [{ + name: "Temp", labels: ["Jan", "Feb", "Mar"], values: [32, 35, 42] +}], { x: 0.5, y: 4, w: 6, h: 3, lineSize: 3, lineSmooth: true }); + +// Pie chart +slide.addChart(pres.charts.PIE, [{ + name: "Share", labels: ["A", "B", "Other"], values: [35, 45, 20] +}], { x: 7, y: 1, w: 5, h: 4, showPercent: true }); +``` + +### Better-Looking Charts + +Default charts look dated. Apply these options for a modern, clean appearance: + +```javascript +slide.addChart(pres.charts.BAR, chartData, { + x: 0.5, y: 1, w: 9, h: 4, barDir: "col", + + // Custom colors (match your presentation palette) + chartColors: ["0D9488", "14B8A6", "5EEAD4"], + + // Clean background + chartArea: { fill: { color: "FFFFFF" }, roundedCorners: true }, + + // Muted axis labels + catAxisLabelColor: "64748B", + valAxisLabelColor: "64748B", + + // Subtle grid (value axis only) + valGridLine: { color: "E2E8F0", size: 0.5 }, + catGridLine: { style: "none" }, + + // Data labels on bars + showValue: true, + dataLabelPosition: "outEnd", + dataLabelColor: "1E293B", + + // Hide legend for single series + showLegend: false, +}); +``` + +**Key styling options:** +- `chartColors: [...]` - hex colors for series/segments +- `chartArea: { fill, border, roundedCorners }` - chart background +- `catGridLine/valGridLine: { color, style, size }` - grid lines (`style: "none"` to hide) +- `lineSmooth: true` - curved lines (line charts) +- `legendPos: "r"` - legend position: "b", "t", "l", "r", "tr" + +--- + +## Slide Masters + +```javascript +pres.defineSlideMaster({ + title: 'TITLE_SLIDE', background: { color: '283A5E' }, + objects: [{ + placeholder: { options: { name: 'title', type: 'title', x: 1, y: 2, w: 8, h: 2 } } + }] +}); + +let titleSlide = pres.addSlide({ masterName: "TITLE_SLIDE" }); +titleSlide.addText("My Title", { placeholder: "title" }); +``` + +--- + +## Common Pitfalls + +⚠️ These issues cause file corruption, visual bugs, or broken output. Avoid them. + +1. **NEVER use "#" with hex colors** - causes file corruption + ```javascript + color: "FF0000" // ✅ CORRECT + color: "#FF0000" // ❌ WRONG + ``` + +2. **NEVER encode opacity in hex color strings** - 8-char colors (e.g., `"00000020"`) corrupt the file. Use the `opacity` property instead. + ```javascript + shadow: { type: "outer", blur: 6, offset: 2, color: "00000020" } // ❌ CORRUPTS FILE + shadow: { type: "outer", blur: 6, offset: 2, color: "000000", opacity: 0.12 } // ✅ CORRECT + ``` + +3. **Use `bullet: true`** - NEVER unicode symbols like "•" (creates double bullets) + +4. **Use `breakLine: true`** between array items or text runs together + +5. **Avoid `lineSpacing` with bullets** - causes excessive gaps; use `paraSpaceAfter` instead + +6. **Each presentation needs fresh instance** - don't reuse `pptxgen()` objects + +7. **NEVER reuse option objects across calls** - PptxGenJS mutates objects in-place (e.g. converting shadow values to EMU). Sharing one object between multiple calls corrupts the second shape. + ```javascript + const shadow = { type: "outer", blur: 6, offset: 2, color: "000000", opacity: 0.15 }; + slide.addShape(pres.shapes.RECTANGLE, { shadow, ... }); // ❌ second call gets already-converted values + slide.addShape(pres.shapes.RECTANGLE, { shadow, ... }); + + const makeShadow = () => ({ type: "outer", blur: 6, offset: 2, color: "000000", opacity: 0.15 }); + slide.addShape(pres.shapes.RECTANGLE, { shadow: makeShadow(), ... }); // ✅ fresh object each time + slide.addShape(pres.shapes.RECTANGLE, { shadow: makeShadow(), ... }); + ``` + +8. **Don't use `ROUNDED_RECTANGLE` with accent borders** - rectangular overlay bars won't cover rounded corners. Use `RECTANGLE` instead. + ```javascript + // ❌ WRONG: Accent bar doesn't cover rounded corners + slide.addShape(pres.shapes.ROUNDED_RECTANGLE, { x: 1, y: 1, w: 3, h: 1.5, fill: { color: "FFFFFF" } }); + slide.addShape(pres.shapes.RECTANGLE, { x: 1, y: 1, w: 0.08, h: 1.5, fill: { color: "0891B2" } }); + + // ✅ CORRECT: Use RECTANGLE for clean alignment + slide.addShape(pres.shapes.RECTANGLE, { x: 1, y: 1, w: 3, h: 1.5, fill: { color: "FFFFFF" } }); + slide.addShape(pres.shapes.RECTANGLE, { x: 1, y: 1, w: 0.08, h: 1.5, fill: { color: "0891B2" } }); + ``` + +--- + +## Quick Reference + +- **Shapes**: RECTANGLE, OVAL, LINE, ROUNDED_RECTANGLE +- **Charts**: BAR, LINE, PIE, DOUGHNUT, SCATTER, BUBBLE, RADAR +- **Layouts**: LAYOUT_16x9 (10"×5.625"), LAYOUT_16x10, LAYOUT_4x3, LAYOUT_WIDE +- **Alignment**: "left", "center", "right" +- **Chart data labels**: "outEnd", "inEnd", "center" diff --git a/.github/skills/pptx/scripts/__init__.py b/.github/skills/pptx/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.github/skills/pptx/scripts/add_slide.py b/.github/skills/pptx/scripts/add_slide.py new file mode 100644 index 0000000..13700df --- /dev/null +++ b/.github/skills/pptx/scripts/add_slide.py @@ -0,0 +1,195 @@ +"""Add a new slide to an unpacked PPTX directory. + +Usage: python add_slide.py + +The source can be: + - A slide file (e.g., slide2.xml) - duplicates the slide + - A layout file (e.g., slideLayout2.xml) - creates from layout + +Examples: + python add_slide.py unpacked/ slide2.xml + # Duplicates slide2, creates slide5.xml + + python add_slide.py unpacked/ slideLayout2.xml + # Creates slide5.xml from slideLayout2.xml + +To see available layouts: ls unpacked/ppt/slideLayouts/ + +Prints the element to add to presentation.xml. +""" + +import re +import shutil +import sys +from pathlib import Path + + +def get_next_slide_number(slides_dir: Path) -> int: + existing = [int(m.group(1)) for f in slides_dir.glob("slide*.xml") + if (m := re.match(r"slide(\d+)\.xml", f.name))] + return max(existing) + 1 if existing else 1 + + +def create_slide_from_layout(unpacked_dir: Path, layout_file: str) -> None: + slides_dir = unpacked_dir / "ppt" / "slides" + rels_dir = slides_dir / "_rels" + layouts_dir = unpacked_dir / "ppt" / "slideLayouts" + + layout_path = layouts_dir / layout_file + if not layout_path.exists(): + print(f"Error: {layout_path} not found", file=sys.stderr) + sys.exit(1) + + next_num = get_next_slide_number(slides_dir) + dest = f"slide{next_num}.xml" + dest_slide = slides_dir / dest + dest_rels = rels_dir / f"{dest}.rels" + + slide_xml = ''' + + + + + + + + + + + + + + + + + + + + + +''' + dest_slide.write_text(slide_xml, encoding="utf-8") + + rels_dir.mkdir(exist_ok=True) + rels_xml = f''' + + +''' + dest_rels.write_text(rels_xml, encoding="utf-8") + + _add_to_content_types(unpacked_dir, dest) + + rid = _add_to_presentation_rels(unpacked_dir, dest) + + next_slide_id = _get_next_slide_id(unpacked_dir) + + print(f"Created {dest} from {layout_file}") + print(f'Add to presentation.xml : ') + + +def duplicate_slide(unpacked_dir: Path, source: str) -> None: + slides_dir = unpacked_dir / "ppt" / "slides" + rels_dir = slides_dir / "_rels" + + source_slide = slides_dir / source + + if not source_slide.exists(): + print(f"Error: {source_slide} not found", file=sys.stderr) + sys.exit(1) + + next_num = get_next_slide_number(slides_dir) + dest = f"slide{next_num}.xml" + dest_slide = slides_dir / dest + + source_rels = rels_dir / f"{source}.rels" + dest_rels = rels_dir / f"{dest}.rels" + + shutil.copy2(source_slide, dest_slide) + + if source_rels.exists(): + shutil.copy2(source_rels, dest_rels) + + rels_content = dest_rels.read_text(encoding="utf-8") + rels_content = re.sub( + r'\s*]*Type="[^"]*notesSlide"[^>]*/>\s*', + "\n", + rels_content, + ) + dest_rels.write_text(rels_content, encoding="utf-8") + + _add_to_content_types(unpacked_dir, dest) + + rid = _add_to_presentation_rels(unpacked_dir, dest) + + next_slide_id = _get_next_slide_id(unpacked_dir) + + print(f"Created {dest} from {source}") + print(f'Add to presentation.xml : ') + + +def _add_to_content_types(unpacked_dir: Path, dest: str) -> None: + content_types_path = unpacked_dir / "[Content_Types].xml" + content_types = content_types_path.read_text(encoding="utf-8") + + new_override = f'' + + if f"/ppt/slides/{dest}" not in content_types: + content_types = content_types.replace("", f" {new_override}\n") + content_types_path.write_text(content_types, encoding="utf-8") + + +def _add_to_presentation_rels(unpacked_dir: Path, dest: str) -> str: + pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" + pres_rels = pres_rels_path.read_text(encoding="utf-8") + + rids = [int(m) for m in re.findall(r'Id="rId(\d+)"', pres_rels)] + next_rid = max(rids) + 1 if rids else 1 + rid = f"rId{next_rid}" + + new_rel = f'' + + if f"slides/{dest}" not in pres_rels: + pres_rels = pres_rels.replace("", f" {new_rel}\n") + pres_rels_path.write_text(pres_rels, encoding="utf-8") + + return rid + + +def _get_next_slide_id(unpacked_dir: Path) -> int: + pres_path = unpacked_dir / "ppt" / "presentation.xml" + pres_content = pres_path.read_text(encoding="utf-8") + slide_ids = [int(m) for m in re.findall(r']*id="(\d+)"', pres_content)] + return max(slide_ids) + 1 if slide_ids else 256 + + +def parse_source(source: str) -> tuple[str, str | None]: + if source.startswith("slideLayout") and source.endswith(".xml"): + return ("layout", source) + + return ("slide", None) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python add_slide.py ", file=sys.stderr) + print("", file=sys.stderr) + print("Source can be:", file=sys.stderr) + print(" slide2.xml - duplicate an existing slide", file=sys.stderr) + print(" slideLayout2.xml - create from a layout template", file=sys.stderr) + print("", file=sys.stderr) + print("To see available layouts: ls /ppt/slideLayouts/", file=sys.stderr) + sys.exit(1) + + unpacked_dir = Path(sys.argv[1]) + source = sys.argv[2] + + if not unpacked_dir.exists(): + print(f"Error: {unpacked_dir} not found", file=sys.stderr) + sys.exit(1) + + source_type, layout_file = parse_source(source) + + if source_type == "layout" and layout_file is not None: + create_slide_from_layout(unpacked_dir, layout_file) + else: + duplicate_slide(unpacked_dir, source) diff --git a/.github/skills/pptx/scripts/clean.py b/.github/skills/pptx/scripts/clean.py new file mode 100644 index 0000000..3d13994 --- /dev/null +++ b/.github/skills/pptx/scripts/clean.py @@ -0,0 +1,286 @@ +"""Remove unreferenced files from an unpacked PPTX directory. + +Usage: python clean.py + +Example: + python clean.py unpacked/ + +This script removes: +- Orphaned slides (not in sldIdLst) and their relationships +- [trash] directory (unreferenced files) +- Orphaned .rels files for deleted resources +- Unreferenced media, embeddings, charts, diagrams, drawings, ink files +- Unreferenced theme files +- Unreferenced notes slides +- Content-Type overrides for deleted files +""" + +import sys +from pathlib import Path + +import defusedxml.minidom + + +import re + + +def get_slides_in_sldidlst(unpacked_dir: Path) -> set[str]: + pres_path = unpacked_dir / "ppt" / "presentation.xml" + pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" + + if not pres_path.exists() or not pres_rels_path.exists(): + return set() + + rels_dom = defusedxml.minidom.parse(str(pres_rels_path)) + rid_to_slide = {} + for rel in rels_dom.getElementsByTagName("Relationship"): + rid = rel.getAttribute("Id") + target = rel.getAttribute("Target") + rel_type = rel.getAttribute("Type") + if "slide" in rel_type and target.startswith("slides/"): + rid_to_slide[rid] = target.replace("slides/", "") + + pres_content = pres_path.read_text(encoding="utf-8") + referenced_rids = set(re.findall(r']*r:id="([^"]+)"', pres_content)) + + return {rid_to_slide[rid] for rid in referenced_rids if rid in rid_to_slide} + + +def remove_orphaned_slides(unpacked_dir: Path) -> list[str]: + slides_dir = unpacked_dir / "ppt" / "slides" + slides_rels_dir = slides_dir / "_rels" + pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" + + if not slides_dir.exists(): + return [] + + referenced_slides = get_slides_in_sldidlst(unpacked_dir) + removed = [] + + for slide_file in slides_dir.glob("slide*.xml"): + if slide_file.name not in referenced_slides: + rel_path = slide_file.relative_to(unpacked_dir) + slide_file.unlink() + removed.append(str(rel_path)) + + rels_file = slides_rels_dir / f"{slide_file.name}.rels" + if rels_file.exists(): + rels_file.unlink() + removed.append(str(rels_file.relative_to(unpacked_dir))) + + if removed and pres_rels_path.exists(): + rels_dom = defusedxml.minidom.parse(str(pres_rels_path)) + changed = False + + for rel in list(rels_dom.getElementsByTagName("Relationship")): + target = rel.getAttribute("Target") + if target.startswith("slides/"): + slide_name = target.replace("slides/", "") + if slide_name not in referenced_slides: + if rel.parentNode: + rel.parentNode.removeChild(rel) + changed = True + + if changed: + with open(pres_rels_path, "wb") as f: + f.write(rels_dom.toxml(encoding="utf-8")) + + return removed + + +def remove_trash_directory(unpacked_dir: Path) -> list[str]: + trash_dir = unpacked_dir / "[trash]" + removed = [] + + if trash_dir.exists() and trash_dir.is_dir(): + for file_path in trash_dir.iterdir(): + if file_path.is_file(): + rel_path = file_path.relative_to(unpacked_dir) + removed.append(str(rel_path)) + file_path.unlink() + trash_dir.rmdir() + + return removed + + +def get_slide_referenced_files(unpacked_dir: Path) -> set: + referenced = set() + slides_rels_dir = unpacked_dir / "ppt" / "slides" / "_rels" + + if not slides_rels_dir.exists(): + return referenced + + for rels_file in slides_rels_dir.glob("*.rels"): + dom = defusedxml.minidom.parse(str(rels_file)) + for rel in dom.getElementsByTagName("Relationship"): + target = rel.getAttribute("Target") + if not target: + continue + target_path = (rels_file.parent.parent / target).resolve() + try: + referenced.add(target_path.relative_to(unpacked_dir.resolve())) + except ValueError: + pass + + return referenced + + +def remove_orphaned_rels_files(unpacked_dir: Path) -> list[str]: + resource_dirs = ["charts", "diagrams", "drawings"] + removed = [] + slide_referenced = get_slide_referenced_files(unpacked_dir) + + for dir_name in resource_dirs: + rels_dir = unpacked_dir / "ppt" / dir_name / "_rels" + if not rels_dir.exists(): + continue + + for rels_file in rels_dir.glob("*.rels"): + resource_file = rels_dir.parent / rels_file.name.replace(".rels", "") + try: + resource_rel_path = resource_file.resolve().relative_to(unpacked_dir.resolve()) + except ValueError: + continue + + if not resource_file.exists() or resource_rel_path not in slide_referenced: + rels_file.unlink() + rel_path = rels_file.relative_to(unpacked_dir) + removed.append(str(rel_path)) + + return removed + + +def get_referenced_files(unpacked_dir: Path) -> set: + referenced = set() + + for rels_file in unpacked_dir.rglob("*.rels"): + dom = defusedxml.minidom.parse(str(rels_file)) + for rel in dom.getElementsByTagName("Relationship"): + target = rel.getAttribute("Target") + if not target: + continue + target_path = (rels_file.parent.parent / target).resolve() + try: + referenced.add(target_path.relative_to(unpacked_dir.resolve())) + except ValueError: + pass + + return referenced + + +def remove_orphaned_files(unpacked_dir: Path, referenced: set) -> list[str]: + resource_dirs = ["media", "embeddings", "charts", "diagrams", "tags", "drawings", "ink"] + removed = [] + + for dir_name in resource_dirs: + dir_path = unpacked_dir / "ppt" / dir_name + if not dir_path.exists(): + continue + + for file_path in dir_path.glob("*"): + if not file_path.is_file(): + continue + rel_path = file_path.relative_to(unpacked_dir) + if rel_path not in referenced: + file_path.unlink() + removed.append(str(rel_path)) + + theme_dir = unpacked_dir / "ppt" / "theme" + if theme_dir.exists(): + for file_path in theme_dir.glob("theme*.xml"): + rel_path = file_path.relative_to(unpacked_dir) + if rel_path not in referenced: + file_path.unlink() + removed.append(str(rel_path)) + theme_rels = theme_dir / "_rels" / f"{file_path.name}.rels" + if theme_rels.exists(): + theme_rels.unlink() + removed.append(str(theme_rels.relative_to(unpacked_dir))) + + notes_dir = unpacked_dir / "ppt" / "notesSlides" + if notes_dir.exists(): + for file_path in notes_dir.glob("*.xml"): + if not file_path.is_file(): + continue + rel_path = file_path.relative_to(unpacked_dir) + if rel_path not in referenced: + file_path.unlink() + removed.append(str(rel_path)) + + notes_rels_dir = notes_dir / "_rels" + if notes_rels_dir.exists(): + for file_path in notes_rels_dir.glob("*.rels"): + notes_file = notes_dir / file_path.name.replace(".rels", "") + if not notes_file.exists(): + file_path.unlink() + removed.append(str(file_path.relative_to(unpacked_dir))) + + return removed + + +def update_content_types(unpacked_dir: Path, removed_files: list[str]) -> None: + ct_path = unpacked_dir / "[Content_Types].xml" + if not ct_path.exists(): + return + + dom = defusedxml.minidom.parse(str(ct_path)) + changed = False + + for override in list(dom.getElementsByTagName("Override")): + part_name = override.getAttribute("PartName").lstrip("/") + if part_name in removed_files: + if override.parentNode: + override.parentNode.removeChild(override) + changed = True + + if changed: + with open(ct_path, "wb") as f: + f.write(dom.toxml(encoding="utf-8")) + + +def clean_unused_files(unpacked_dir: Path) -> list[str]: + all_removed = [] + + slides_removed = remove_orphaned_slides(unpacked_dir) + all_removed.extend(slides_removed) + + trash_removed = remove_trash_directory(unpacked_dir) + all_removed.extend(trash_removed) + + while True: + removed_rels = remove_orphaned_rels_files(unpacked_dir) + referenced = get_referenced_files(unpacked_dir) + removed_files = remove_orphaned_files(unpacked_dir, referenced) + + total_removed = removed_rels + removed_files + if not total_removed: + break + + all_removed.extend(total_removed) + + if all_removed: + update_content_types(unpacked_dir, all_removed) + + return all_removed + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python clean.py ", file=sys.stderr) + print("Example: python clean.py unpacked/", file=sys.stderr) + sys.exit(1) + + unpacked_dir = Path(sys.argv[1]) + + if not unpacked_dir.exists(): + print(f"Error: {unpacked_dir} not found", file=sys.stderr) + sys.exit(1) + + removed = clean_unused_files(unpacked_dir) + + if removed: + print(f"Removed {len(removed)} unreferenced files:") + for f in removed: + print(f" {f}") + else: + print("No unreferenced files found") diff --git a/.github/skills/pptx/scripts/office/helpers/__init__.py b/.github/skills/pptx/scripts/office/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.github/skills/pptx/scripts/office/helpers/merge_runs.py b/.github/skills/pptx/scripts/office/helpers/merge_runs.py new file mode 100644 index 0000000..ad7c25e --- /dev/null +++ b/.github/skills/pptx/scripts/office/helpers/merge_runs.py @@ -0,0 +1,199 @@ +"""Merge adjacent runs with identical formatting in DOCX. + +Merges adjacent elements that have identical properties. +Works on runs in paragraphs and inside tracked changes (, ). + +Also: +- Removes rsid attributes from runs (revision metadata that doesn't affect rendering) +- Removes proofErr elements (spell/grammar markers that block merging) +""" + +from pathlib import Path + +import defusedxml.minidom + + +def merge_runs(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + _remove_elements(root, "proofErr") + _strip_run_rsid_attrs(root) + + containers = {run.parentNode for run in _find_elements(root, "r")} + + merge_count = 0 + for container in containers: + merge_count += _merge_runs_in(container) + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Merged {merge_count} runs" + + except Exception as e: + return 0, f"Error: {e}" + + + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def _get_child(parent, tag: str): + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + return child + return None + + +def _get_children(parent, tag: str) -> list: + results = [] + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(child) + return results + + +def _is_adjacent(elem1, elem2) -> bool: + node = elem1.nextSibling + while node: + if node == elem2: + return True + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + return False + + + + +def _remove_elements(root, tag: str): + for elem in _find_elements(root, tag): + if elem.parentNode: + elem.parentNode.removeChild(elem) + + +def _strip_run_rsid_attrs(root): + for run in _find_elements(root, "r"): + for attr in list(run.attributes.values()): + if "rsid" in attr.name.lower(): + run.removeAttribute(attr.name) + + + + +def _merge_runs_in(container) -> int: + merge_count = 0 + run = _first_child_run(container) + + while run: + while True: + next_elem = _next_element_sibling(run) + if next_elem and _is_run(next_elem) and _can_merge(run, next_elem): + _merge_run_content(run, next_elem) + container.removeChild(next_elem) + merge_count += 1 + else: + break + + _consolidate_text(run) + run = _next_sibling_run(run) + + return merge_count + + +def _first_child_run(container): + for child in container.childNodes: + if child.nodeType == child.ELEMENT_NODE and _is_run(child): + return child + return None + + +def _next_element_sibling(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + return sibling + sibling = sibling.nextSibling + return None + + +def _next_sibling_run(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + if _is_run(sibling): + return sibling + sibling = sibling.nextSibling + return None + + +def _is_run(node) -> bool: + name = node.localName or node.tagName + return name == "r" or name.endswith(":r") + + +def _can_merge(run1, run2) -> bool: + rpr1 = _get_child(run1, "rPr") + rpr2 = _get_child(run2, "rPr") + + if (rpr1 is None) != (rpr2 is None): + return False + if rpr1 is None: + return True + return rpr1.toxml() == rpr2.toxml() + + +def _merge_run_content(target, source): + for child in list(source.childNodes): + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name != "rPr" and not name.endswith(":rPr"): + target.appendChild(child) + + +def _consolidate_text(run): + t_elements = _get_children(run, "t") + + for i in range(len(t_elements) - 1, 0, -1): + curr, prev = t_elements[i], t_elements[i - 1] + + if _is_adjacent(prev, curr): + prev_text = prev.firstChild.data if prev.firstChild else "" + curr_text = curr.firstChild.data if curr.firstChild else "" + merged = prev_text + curr_text + + if prev.firstChild: + prev.firstChild.data = merged + else: + prev.appendChild(run.ownerDocument.createTextNode(merged)) + + if merged.startswith(" ") or merged.endswith(" "): + prev.setAttribute("xml:space", "preserve") + elif prev.hasAttribute("xml:space"): + prev.removeAttribute("xml:space") + + run.removeChild(curr) diff --git a/.github/skills/pptx/scripts/office/helpers/simplify_redlines.py b/.github/skills/pptx/scripts/office/helpers/simplify_redlines.py new file mode 100644 index 0000000..db963bb --- /dev/null +++ b/.github/skills/pptx/scripts/office/helpers/simplify_redlines.py @@ -0,0 +1,197 @@ +"""Simplify tracked changes by merging adjacent w:ins or w:del elements. + +Merges adjacent elements from the same author into a single element. +Same for elements. This makes heavily-redlined documents easier to +work with by reducing the number of tracked change wrappers. + +Rules: +- Only merges w:ins with w:ins, w:del with w:del (same element type) +- Only merges if same author (ignores timestamp differences) +- Only merges if truly adjacent (only whitespace between them) +""" + +import xml.etree.ElementTree as ET +import zipfile +from pathlib import Path + +import defusedxml.minidom + +WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def simplify_redlines(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + merge_count = 0 + + containers = _find_elements(root, "p") + _find_elements(root, "tc") + + for container in containers: + merge_count += _merge_tracked_changes_in(container, "ins") + merge_count += _merge_tracked_changes_in(container, "del") + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Simplified {merge_count} tracked changes" + + except Exception as e: + return 0, f"Error: {e}" + + +def _merge_tracked_changes_in(container, tag: str) -> int: + merge_count = 0 + + tracked = [ + child + for child in container.childNodes + if child.nodeType == child.ELEMENT_NODE and _is_element(child, tag) + ] + + if len(tracked) < 2: + return 0 + + i = 0 + while i < len(tracked) - 1: + curr = tracked[i] + next_elem = tracked[i + 1] + + if _can_merge_tracked(curr, next_elem): + _merge_tracked_content(curr, next_elem) + container.removeChild(next_elem) + tracked.pop(i + 1) + merge_count += 1 + else: + i += 1 + + return merge_count + + +def _is_element(node, tag: str) -> bool: + name = node.localName or node.tagName + return name == tag or name.endswith(f":{tag}") + + +def _get_author(elem) -> str: + author = elem.getAttribute("w:author") + if not author: + for attr in elem.attributes.values(): + if attr.localName == "author" or attr.name.endswith(":author"): + return attr.value + return author + + +def _can_merge_tracked(elem1, elem2) -> bool: + if _get_author(elem1) != _get_author(elem2): + return False + + node = elem1.nextSibling + while node and node != elem2: + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + + return True + + +def _merge_tracked_content(target, source): + while source.firstChild: + child = source.firstChild + source.removeChild(child) + target.appendChild(child) + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]: + if not doc_xml_path.exists(): + return {} + + try: + tree = ET.parse(doc_xml_path) + root = tree.getroot() + except ET.ParseError: + return {} + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + + return authors + + +def _get_authors_from_docx(docx_path: Path) -> dict[str, int]: + try: + with zipfile.ZipFile(docx_path, "r") as zf: + if "word/document.xml" not in zf.namelist(): + return {} + with zf.open("word/document.xml") as f: + tree = ET.parse(f) + root = tree.getroot() + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + return authors + except (zipfile.BadZipFile, ET.ParseError): + return {} + + +def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str: + modified_xml = modified_dir / "word" / "document.xml" + modified_authors = get_tracked_change_authors(modified_xml) + + if not modified_authors: + return default + + original_authors = _get_authors_from_docx(original_docx) + + new_changes: dict[str, int] = {} + for author, count in modified_authors.items(): + original_count = original_authors.get(author, 0) + diff = count - original_count + if diff > 0: + new_changes[author] = diff + + if not new_changes: + return default + + if len(new_changes) == 1: + return next(iter(new_changes)) + + raise ValueError( + f"Multiple authors added new changes: {new_changes}. " + "Cannot infer which author to validate." + ) diff --git a/.github/skills/pptx/scripts/office/pack.py b/.github/skills/pptx/scripts/office/pack.py new file mode 100644 index 0000000..db29ed8 --- /dev/null +++ b/.github/skills/pptx/scripts/office/pack.py @@ -0,0 +1,159 @@ +"""Pack a directory into a DOCX, PPTX, or XLSX file. + +Validates with auto-repair, condenses XML formatting, and creates the Office file. + +Usage: + python pack.py [--original ] [--validate true|false] + +Examples: + python pack.py unpacked/ output.docx --original input.docx + python pack.py unpacked/ output.pptx --validate false +""" + +import argparse +import sys +import shutil +import tempfile +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + +def pack( + input_directory: str, + output_file: str, + original_file: str | None = None, + validate: bool = True, + infer_author_func=None, +) -> tuple[None, str]: + input_dir = Path(input_directory) + output_path = Path(output_file) + suffix = output_path.suffix.lower() + + if not input_dir.is_dir(): + return None, f"Error: {input_dir} is not a directory" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {output_file} must be a .docx, .pptx, or .xlsx file" + + if validate and original_file: + original_path = Path(original_file) + if original_path.exists(): + success, output = _run_validation( + input_dir, original_path, suffix, infer_author_func + ) + if output: + print(output) + if not success: + return None, f"Error: Validation failed for {input_dir}" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + _condense_xml(xml_file) + + output_path.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + return None, f"Successfully packed {input_dir} to {output_file}" + + +def _run_validation( + unpacked_dir: Path, + original_file: Path, + suffix: str, + infer_author_func=None, +) -> tuple[bool, str | None]: + output_lines = [] + validators = [] + + if suffix == ".docx": + author = "Claude" + if infer_author_func: + try: + author = infer_author_func(unpacked_dir, original_file) + except ValueError as e: + print(f"Warning: {e} Using default author 'Claude'.", file=sys.stderr) + + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file), + RedliningValidator(unpacked_dir, original_file, author=author), + ] + elif suffix == ".pptx": + validators = [PPTXSchemaValidator(unpacked_dir, original_file)] + + if not validators: + return True, None + + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + output_lines.append(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + output_lines.append("All validations PASSED!") + + return success, "\n".join(output_lines) if output_lines else None + + +def _condense_xml(xml_file: Path) -> None: + try: + with open(xml_file, encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + for element in dom.getElementsByTagName("*"): + if element.tagName.endswith(":t"): + continue + + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + except Exception as e: + print(f"ERROR: Failed to parse {xml_file.name}: {e}", file=sys.stderr) + raise + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pack a directory into a DOCX, PPTX, or XLSX file" + ) + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument( + "--original", + help="Original file for validation comparison", + ) + parser.add_argument( + "--validate", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Run validation with auto-repair (default: true)", + ) + args = parser.parse_args() + + _, message = pack( + args.input_directory, + args.output_file, + original_file=args.original, + validate=args.validate, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 0000000..6454ef9 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 0000000..afa4f46 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 0000000..64e66b8 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 0000000..687eea8 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 0000000..6ac81b0 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 0000000..1dbf051 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..f1af17d --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..0a185ab --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 0000000..14ef488 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 0000000..c20f3bf --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 0000000..ac60252 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 0000000..424b8ba --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 0000000..2bddce2 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 0000000..8a8c18b --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 0000000..5c42706 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 0000000..853c341 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 0000000..da835ee --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 0000000..87ad265 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 0000000..9e86f1b --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 0000000..d0be42e --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 0000000..8821dd1 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 0000000..ca2575c --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 0000000..dd079e6 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..3dd6cf6 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..f1041e3 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 0000000..9c5b7a6 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 0000000..0f13678 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 0000000..a6de9d2 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 0000000..10e978b --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 0000000..4248bf7 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 0000000..5649746 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/mce/mc.xsd b/.github/skills/pptx/scripts/office/schemas/mce/mc.xsd new file mode 100644 index 0000000..ef72545 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd new file mode 100644 index 0000000..f65f777 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd new file mode 100644 index 0000000..6b00755 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd new file mode 100644 index 0000000..f321d33 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 0000000..364c6a9 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 0000000..fed9d15 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 0000000..680cf15 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/.github/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 0000000..89ada90 --- /dev/null +++ b/.github/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/.github/skills/pptx/scripts/office/soffice.py b/.github/skills/pptx/scripts/office/soffice.py new file mode 100644 index 0000000..c7f7e32 --- /dev/null +++ b/.github/skills/pptx/scripts/office/soffice.py @@ -0,0 +1,183 @@ +""" +Helper for running LibreOffice (soffice) in environments where AF_UNIX +sockets may be blocked (e.g., sandboxed VMs). Detects the restriction +at runtime and applies an LD_PRELOAD shim if needed. + +Usage: + from office.soffice import run_soffice, get_soffice_env + + # Option 1 – run soffice directly + result = run_soffice(["--headless", "--convert-to", "pdf", "input.docx"]) + + # Option 2 – get env dict for your own subprocess calls + env = get_soffice_env() + subprocess.run(["soffice", ...], env=env) +""" + +import os +import socket +import subprocess +import tempfile +from pathlib import Path + + +def get_soffice_env() -> dict: + env = os.environ.copy() + env["SAL_USE_VCLPLUGIN"] = "svp" + + if _needs_shim(): + shim = _ensure_shim() + env["LD_PRELOAD"] = str(shim) + + return env + + +def run_soffice(args: list[str], **kwargs) -> subprocess.CompletedProcess: + env = get_soffice_env() + return subprocess.run(["soffice"] + args, env=env, **kwargs) + + + +_SHIM_SO = Path(tempfile.gettempdir()) / "lo_socket_shim.so" + + +def _needs_shim() -> bool: + try: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.close() + return False + except OSError: + return True + + +def _ensure_shim() -> Path: + if _SHIM_SO.exists(): + return _SHIM_SO + + src = Path(tempfile.gettempdir()) / "lo_socket_shim.c" + src.write_text(_SHIM_SOURCE) + subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", str(_SHIM_SO), str(src), "-ldl"], + check=True, + capture_output=True, + ) + src.unlink() + return _SHIM_SO + + + +_SHIM_SOURCE = r""" +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +static int (*real_socket)(int, int, int); +static int (*real_socketpair)(int, int, int, int[2]); +static int (*real_listen)(int, int); +static int (*real_accept)(int, struct sockaddr *, socklen_t *); +static int (*real_close)(int); +static int (*real_read)(int, void *, size_t); + +/* Per-FD bookkeeping (FDs >= 1024 are passed through unshimmed). */ +static int is_shimmed[1024]; +static int peer_of[1024]; +static int wake_r[1024]; /* accept() blocks reading this */ +static int wake_w[1024]; /* close() writes to this */ +static int listener_fd = -1; /* FD that received listen() */ + +__attribute__((constructor)) +static void init(void) { + real_socket = dlsym(RTLD_NEXT, "socket"); + real_socketpair = dlsym(RTLD_NEXT, "socketpair"); + real_listen = dlsym(RTLD_NEXT, "listen"); + real_accept = dlsym(RTLD_NEXT, "accept"); + real_close = dlsym(RTLD_NEXT, "close"); + real_read = dlsym(RTLD_NEXT, "read"); + for (int i = 0; i < 1024; i++) { + peer_of[i] = -1; + wake_r[i] = -1; + wake_w[i] = -1; + } +} + +/* ---- socket ---------------------------------------------------------- */ +int socket(int domain, int type, int protocol) { + if (domain == AF_UNIX) { + int fd = real_socket(domain, type, protocol); + if (fd >= 0) return fd; + /* socket(AF_UNIX) blocked – fall back to socketpair(). */ + int sv[2]; + if (real_socketpair(domain, type, protocol, sv) == 0) { + if (sv[0] >= 0 && sv[0] < 1024) { + is_shimmed[sv[0]] = 1; + peer_of[sv[0]] = sv[1]; + int wp[2]; + if (pipe(wp) == 0) { + wake_r[sv[0]] = wp[0]; + wake_w[sv[0]] = wp[1]; + } + } + return sv[0]; + } + errno = EPERM; + return -1; + } + return real_socket(domain, type, protocol); +} + +/* ---- listen ---------------------------------------------------------- */ +int listen(int sockfd, int backlog) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + listener_fd = sockfd; + return 0; + } + return real_listen(sockfd, backlog); +} + +/* ---- accept ---------------------------------------------------------- */ +int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + /* Block until close() writes to the wake pipe. */ + if (wake_r[sockfd] >= 0) { + char buf; + real_read(wake_r[sockfd], &buf, 1); + } + errno = ECONNABORTED; + return -1; + } + return real_accept(sockfd, addr, addrlen); +} + +/* ---- close ----------------------------------------------------------- */ +int close(int fd) { + if (fd >= 0 && fd < 1024 && is_shimmed[fd]) { + int was_listener = (fd == listener_fd); + is_shimmed[fd] = 0; + + if (wake_w[fd] >= 0) { /* unblock accept() */ + char c = 0; + write(wake_w[fd], &c, 1); + real_close(wake_w[fd]); + wake_w[fd] = -1; + } + if (wake_r[fd] >= 0) { real_close(wake_r[fd]); wake_r[fd] = -1; } + if (peer_of[fd] >= 0) { real_close(peer_of[fd]); peer_of[fd] = -1; } + + if (was_listener) + _exit(0); /* conversion done – exit */ + } + return real_close(fd); +} +""" + + + +if __name__ == "__main__": + import sys + result = run_soffice(sys.argv[1:]) + sys.exit(result.returncode) diff --git a/.github/skills/pptx/scripts/office/unpack.py b/.github/skills/pptx/scripts/office/unpack.py new file mode 100644 index 0000000..0015253 --- /dev/null +++ b/.github/skills/pptx/scripts/office/unpack.py @@ -0,0 +1,132 @@ +"""Unpack Office files (DOCX, PPTX, XLSX) for editing. + +Extracts the ZIP archive, pretty-prints XML files, and optionally: +- Merges adjacent runs with identical formatting (DOCX only) +- Simplifies adjacent tracked changes from same author (DOCX only) + +Usage: + python unpack.py [options] + +Examples: + python unpack.py document.docx unpacked/ + python unpack.py presentation.pptx unpacked/ + python unpack.py document.docx unpacked/ --merge-runs false +""" + +import argparse +import sys +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from helpers.merge_runs import merge_runs as do_merge_runs +from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines + +SMART_QUOTE_REPLACEMENTS = { + "\u201c": "“", + "\u201d": "”", + "\u2018": "‘", + "\u2019": "’", +} + + +def unpack( + input_file: str, + output_directory: str, + merge_runs: bool = True, + simplify_redlines: bool = True, +) -> tuple[None, str]: + input_path = Path(input_file) + output_path = Path(output_directory) + suffix = input_path.suffix.lower() + + if not input_path.exists(): + return None, f"Error: {input_file} does not exist" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file" + + try: + output_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(input_path, "r") as zf: + zf.extractall(output_path) + + xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) + for xml_file in xml_files: + _pretty_print_xml(xml_file) + + message = f"Unpacked {input_file} ({len(xml_files)} XML files)" + + if suffix == ".docx": + if simplify_redlines: + simplify_count, _ = do_simplify_redlines(str(output_path)) + message += f", simplified {simplify_count} tracked changes" + + if merge_runs: + merge_count, _ = do_merge_runs(str(output_path)) + message += f", merged {merge_count} runs" + + for xml_file in xml_files: + _escape_smart_quotes(xml_file) + + return None, message + + except zipfile.BadZipFile: + return None, f"Error: {input_file} is not a valid Office file" + except Exception as e: + return None, f"Error unpacking: {e}" + + +def _pretty_print_xml(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="utf-8")) + except Exception: + pass + + +def _escape_smart_quotes(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + for char, entity in SMART_QUOTE_REPLACEMENTS.items(): + content = content.replace(char, entity) + xml_file.write_text(content, encoding="utf-8") + except Exception: + pass + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Unpack an Office file (DOCX, PPTX, XLSX) for editing" + ) + parser.add_argument("input_file", help="Office file to unpack") + parser.add_argument("output_directory", help="Output directory") + parser.add_argument( + "--merge-runs", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent runs with identical formatting (DOCX only, default: true)", + ) + parser.add_argument( + "--simplify-redlines", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent tracked changes from same author (DOCX only, default: true)", + ) + args = parser.parse_args() + + _, message = unpack( + args.input_file, + args.output_directory, + merge_runs=args.merge_runs, + simplify_redlines=args.simplify_redlines, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/.github/skills/pptx/scripts/office/validate.py b/.github/skills/pptx/scripts/office/validate.py new file mode 100644 index 0000000..03b01f6 --- /dev/null +++ b/.github/skills/pptx/scripts/office/validate.py @@ -0,0 +1,111 @@ +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. + +Usage: + python validate.py [--original ] [--auto-repair] [--author NAME] + +The first argument can be either: +- An unpacked directory containing the Office document XML files +- A packed Office file (.docx/.pptx/.xlsx) which will be unpacked to a temp directory + +Auto-repair fixes: +- paraId/durableId values that exceed OOXML limits +- Missing xml:space="preserve" on w:t elements with whitespace +""" + +import argparse +import sys +import tempfile +import zipfile +from pathlib import Path + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "path", + help="Path to unpacked directory or packed Office file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "--original", + required=False, + default=None, + help="Path to original file (.docx/.pptx/.xlsx). If omitted, all XSD errors are reported and redlining validation is skipped.", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--auto-repair", + action="store_true", + help="Automatically repair common issues (hex IDs, whitespace preservation)", + ) + parser.add_argument( + "--author", + default="Claude", + help="Author name for redlining validation (default: Claude)", + ) + args = parser.parse_args() + + path = Path(args.path) + assert path.exists(), f"Error: {path} does not exist" + + original_file = None + if args.original: + original_file = Path(args.original) + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert original_file.suffix.lower() in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + file_extension = (original_file or path).suffix.lower() + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: Cannot determine file type from {path}. Use --original or provide a .docx/.pptx/.xlsx file." + ) + + if path.is_file() and path.suffix.lower() in [".docx", ".pptx", ".xlsx"]: + temp_dir = tempfile.mkdtemp() + with zipfile.ZipFile(path, "r") as zf: + zf.extractall(temp_dir) + unpacked_dir = Path(temp_dir) + else: + assert path.is_dir(), f"Error: {path} is not a directory or Office file" + unpacked_dir = path + + match file_extension: + case ".docx": + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + if original_file: + validators.append( + RedliningValidator(unpacked_dir, original_file, verbose=args.verbose, author=args.author) + ) + case ".pptx": + validators = [ + PPTXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + if args.auto_repair: + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + print(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/.github/skills/pptx/scripts/office/validators/__init__.py b/.github/skills/pptx/scripts/office/validators/__init__.py new file mode 100644 index 0000000..db092ec --- /dev/null +++ b/.github/skills/pptx/scripts/office/validators/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/.github/skills/pptx/scripts/office/validators/base.py b/.github/skills/pptx/scripts/office/validators/base.py new file mode 100644 index 0000000..db4a06a --- /dev/null +++ b/.github/skills/pptx/scripts/office/validators/base.py @@ -0,0 +1,847 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import defusedxml.minidom +import lxml.etree + + +class BaseSchemaValidator: + + IGNORED_VALIDATION_ERRORS = [ + "hyphenationZone", + "purl.org/dc/terms", + ] + + UNIQUE_ID_REQUIREMENTS = { + "comment": ("id", "file"), + "commentrangestart": ("id", "file"), + "commentrangeend": ("id", "file"), + "bookmarkstart": ("id", "file"), + "bookmarkend": ("id", "file"), + "sldid": ("id", "file"), + "sldmasterid": ("id", "global"), + "sldlayoutid": ("id", "global"), + "cm": ("authorid", "file"), + "sheet": ("sheetid", "file"), + "definedname": ("id", "file"), + "cxnsp": ("id", "file"), + "sp": ("id", "file"), + "pic": ("id", "file"), + "grpsp": ("id", "file"), + } + + EXCLUDED_ID_CONTAINERS = { + "sectionlst", + } + + ELEMENT_RELATIONSHIP_TYPES = {} + + SCHEMA_MAPPINGS = { + "word": "ISO-IEC29500-4_2016/wml.xsd", + "ppt": "ISO-IEC29500-4_2016/pml.xsd", + "xl": "ISO-IEC29500-4_2016/sml.xsd", + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file=None, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) if original_file else None + self.verbose = verbose + + self.schemas_dir = Path(__file__).parent.parent / "schemas" + + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + raise NotImplementedError("Subclasses must implement the validate method") + + def repair(self) -> int: + return self.repair_whitespace_preservation() + + def repair_whitespace_preservation(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if elem.tagName.endswith(":t") and elem.firstChild: + text = elem.firstChild.nodeValue + if text and (text.startswith((' ', '\t')) or text.endswith((' ', '\t'))): + if elem.getAttribute("xml:space") != "preserve": + elem.setAttribute("xml:space", "preserve") + text_preview = repr(text[:30]) + "..." if len(text) > 30 else repr(text) + print(f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}") + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + def validate_xml(self): + errors = [] + + for xml_file in self.xml_files: + try: + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + errors = [] + global_ids = {} + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} + + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + for elem in root.iter(): + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + if tag in self.UNIQUE_ID_REQUIREMENTS: + in_excluded_container = any( + ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS + for ancestor in elem.iterancestors() + ) + if in_excluded_container: + continue + + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + errors = [] + + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): + all_files.append(file_path.resolve()) + + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + for rels_file in rels_files: + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + rels_dir = rels_file.parent + + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): + if target.startswith("/"): + target_path = self.unpacked_dir / target.lstrip("/") + elif rels_file.name == ".rels": + target_path = self.unpacked_dir / target + else: + base_dir = rels_dir.parent + target_path = base_dir / target + + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + import lxml.etree + + errors = [] + + for xml_file in self.xml_files: + if xml_file.suffix == ".rels": + continue + + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + if not rels_file.exists(): + continue + + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE + rid_attrs_to_check = ["id", "embed", "link"] + for elem in xml_root.iter(): + for attr_name in rid_attrs_to_check: + rid_attr = elem.get(f"{{{r_ns}}}{attr_name}") + if not rid_attr: + continue + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})" + ) + elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + elem_lower = element_name.lower() + + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + if elem_lower.endswith("id") and len(elem_lower) > 2: + prefix = elem_lower[:-2] + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + if prefix == "sld": + return "slide" + return prefix.lower() + + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] + return prefix.lower() + + return None + + def validate_content_types(self): + errors = [] + + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", + "document", + "workbook", + "worksheet", + "theme", + } + + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue + + for file_path in all_files: + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() + elif is_valid: + return True, set() + + original_errors = self._get_original_file_errors(xml_file) + + assert current_errors is not None + new_errors = current_errors - original_errors + + new_errors = { + e for e in new_errors + if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS) + } + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + original_error_count += 1 + valid_count += 1 + continue + + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: + new_errors.append( + f" - {error[:250]}..." if len(error) > 250 else f" - {error}" + ) + + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + for attr in attrs_to_remove: + del elem.attrib[attr] + + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + elements_to_remove = [] + + for elem in list(root): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + self._remove_ignorable_elements(elem) + + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + root = xml_doc.getroot() + + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None + + try: + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + if self.original_file is None: + return set() + + import tempfile + import zipfile + + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + return set() + + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + for elem in xml_copy.iter(): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/pptx/scripts/office/validators/docx.py b/.github/skills/pptx/scripts/office/validators/docx.py new file mode 100644 index 0000000..fec405e --- /dev/null +++ b/.github/skills/pptx/scripts/office/validators/docx.py @@ -0,0 +1,446 @@ +""" +Validator for Word document XML files against XSD schemas. +""" + +import random +import re +import tempfile +import zipfile + +import defusedxml.minidom +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml" + W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid" + + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_whitespace_preservation(): + all_valid = False + + if not self.validate_deletions(): + all_valid = False + + if not self.validate_insertions(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_id_constraints(): + all_valid = False + + if not self.validate_comment_markers(): + all_valid = False + + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + if re.search(r"^[ \t\n\r]", text) or re.search( + r"[ \t\n\r]$", text + ): + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces): + if t_elem.text: + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + for instr_elem in root.xpath( + ".//w:del//w:instrText", namespaces=namespaces + ): + text_preview = ( + repr(instr_elem.text or "")[:50] + "..." + if len(repr(instr_elem.text or "")) > 50 + else repr(instr_elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {instr_elem.sourceline}: found within (use ): {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + count = 0 + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + original = self.original_file + if original is None: + return 0 + + count = 0 + + try: + with tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(original, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + def _parse_id_value(self, val: str, base: int = 16) -> int: + return int(val, base) + + def validate_id_constraints(self): + errors = [] + para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId" + durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId" + + for xml_file in self.xml_files: + try: + for elem in lxml.etree.parse(str(xml_file)).iter(): + if val := elem.get(para_id_attr): + if self._parse_id_value(val, base=16) >= 0x80000000: + errors.append( + f" {xml_file.name}:{elem.sourceline}: paraId={val} >= 0x80000000" + ) + + if val := elem.get(durable_id_attr): + if xml_file.name == "numbering.xml": + try: + if self._parse_id_value(val, base=10) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except ValueError: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} must be decimal in numbering.xml" + ) + else: + if self._parse_id_value(val, base=16) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except Exception: + pass + + if errors: + print(f"FAILED - {len(errors)} ID constraint violations:") + for e in errors: + print(e) + elif self.verbose: + print("PASSED - All paraId/durableId values within constraints") + return not errors + + def validate_comment_markers(self): + errors = [] + + document_xml = None + comments_xml = None + for xml_file in self.xml_files: + if xml_file.name == "document.xml" and "word" in str(xml_file): + document_xml = xml_file + elif xml_file.name == "comments.xml": + comments_xml = xml_file + + if not document_xml: + if self.verbose: + print("PASSED - No document.xml found (skipping comment validation)") + return True + + try: + doc_root = lxml.etree.parse(str(document_xml)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + range_starts = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeStart", namespaces=namespaces + ) + } + range_ends = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeEnd", namespaces=namespaces + ) + } + references = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentReference", namespaces=namespaces + ) + } + + orphaned_ends = range_ends - range_starts + for comment_id in sorted( + orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart' + ) + + orphaned_starts = range_starts - range_ends + for comment_id in sorted( + orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd' + ) + + comment_ids = set() + if comments_xml and comments_xml.exists(): + comments_root = lxml.etree.parse(str(comments_xml)).getroot() + comment_ids = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in comments_root.xpath( + ".//w:comment", namespaces=namespaces + ) + } + + marker_ids = range_starts | range_ends | references + invalid_refs = marker_ids - comment_ids + for comment_id in sorted( + invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + if comment_id: + errors.append( + f' document.xml: marker id="{comment_id}" references non-existent comment' + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append(f" Error parsing XML: {e}") + + if errors: + print(f"FAILED - {len(errors)} comment marker violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All comment markers properly paired") + return True + + def repair(self) -> int: + repairs = super().repair() + repairs += self.repair_durableId() + return repairs + + def repair_durableId(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if not elem.hasAttribute("w16cid:durableId"): + continue + + durable_id = elem.getAttribute("w16cid:durableId") + needs_repair = False + + if xml_file.name == "numbering.xml": + try: + needs_repair = ( + self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + else: + try: + needs_repair = ( + self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + + if needs_repair: + value = random.randint(1, 0x7FFFFFFE) + if xml_file.name == "numbering.xml": + new_id = str(value) + else: + new_id = f"{value:08X}" + + elem.setAttribute("w16cid:durableId", new_id) + print( + f" Repaired: {xml_file.name}: durableId {durable_id} → {new_id}" + ) + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/pptx/scripts/office/validators/pptx.py b/.github/skills/pptx/scripts/office/validators/pptx.py new file mode 100644 index 0000000..09842aa --- /dev/null +++ b/.github/skills/pptx/scripts/office/validators/pptx.py @@ -0,0 +1,275 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_uuid_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_slide_layout_ids(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_notes_slide_references(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + import lxml.etree + + errors = [] + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(): + for attr, value in elem.attrib.items(): + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + if self._looks_like_uuid(value): + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + clean_value = value.strip("{}()").replace("-", "") + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + import lxml.etree + + errors = [] + + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + root = lxml.etree.parse(str(slide_master)).getroot() + + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + import lxml.etree + + errors = [] + notes_slide_references = {} + + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + normalized_target = target.replace("../", "") + + slide_name = rels_file.stem.replace( + ".xml", "" + ) + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/pptx/scripts/office/validators/redlining.py b/.github/skills/pptx/scripts/office/validators/redlining.py new file mode 100644 index 0000000..71c81b6 --- /dev/null +++ b/.github/skills/pptx/scripts/office/validators/redlining.py @@ -0,0 +1,247 @@ +""" +Validator for tracked changes in Word documents. +""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + + def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.author = author + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def repair(self) -> int: + return 0 + + def validate(self): + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + author_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + author_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + + if not author_del_elements and not author_ins_elements: + if self.verbose: + print(f"PASSED - No tracked changes by {self.author} found.") + return True + + except Exception: + pass + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + self._remove_author_tracked_changes(original_root) + self._remove_author_tracked_changes(modified_root) + + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print(f"PASSED - All changes by {self.author} are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + error_parts = [ + f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + pass + + return None + + def _remove_author_tracked_changes(self, root): + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == self.author: + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == self.author: + to_process.append((child, list(parent).index(child))) + + for del_elem, del_index in reversed(to_process): + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/pptx/scripts/thumbnail.py b/.github/skills/pptx/scripts/thumbnail.py new file mode 100644 index 0000000..edcbdc0 --- /dev/null +++ b/.github/skills/pptx/scripts/thumbnail.py @@ -0,0 +1,289 @@ +"""Create thumbnail grids from PowerPoint presentation slides. + +Creates a grid layout of slide thumbnails for quick visual analysis. +Labels each thumbnail with its XML filename (e.g., slide1.xml). +Hidden slides are shown with a placeholder pattern. + +Usage: + python thumbnail.py input.pptx [output_prefix] [--cols N] + +Examples: + python thumbnail.py presentation.pptx + # Creates: thumbnails.jpg + + python thumbnail.py template.pptx grid --cols 4 + # Creates: grid.jpg (or grid-1.jpg, grid-2.jpg for large decks) +""" + +import argparse +import subprocess +import sys +import tempfile +import zipfile +from pathlib import Path + +import defusedxml.minidom +from office.soffice import get_soffice_env +from PIL import Image, ImageDraw, ImageFont + +THUMBNAIL_WIDTH = 300 +CONVERSION_DPI = 100 +MAX_COLS = 6 +DEFAULT_COLS = 3 +JPEG_QUALITY = 95 +GRID_PADDING = 20 +BORDER_WIDTH = 2 +FONT_SIZE_RATIO = 0.10 +LABEL_PADDING_RATIO = 0.4 + + +def main(): + parser = argparse.ArgumentParser( + description="Create thumbnail grids from PowerPoint slides." + ) + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument( + "output_prefix", + nargs="?", + default="thumbnails", + help="Output prefix for image files (default: thumbnails)", + ) + parser.add_argument( + "--cols", + type=int, + default=DEFAULT_COLS, + help=f"Number of columns (default: {DEFAULT_COLS}, max: {MAX_COLS})", + ) + + args = parser.parse_args() + + cols = min(args.cols, MAX_COLS) + if args.cols > MAX_COLS: + print(f"Warning: Columns limited to {MAX_COLS}") + + input_path = Path(args.input) + if not input_path.exists() or input_path.suffix.lower() != ".pptx": + print(f"Error: Invalid PowerPoint file: {args.input}", file=sys.stderr) + sys.exit(1) + + output_path = Path(f"{args.output_prefix}.jpg") + + try: + slide_info = get_slide_info(input_path) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + visible_images = convert_to_images(input_path, temp_path) + + if not visible_images and not any(s["hidden"] for s in slide_info): + print("Error: No slides found", file=sys.stderr) + sys.exit(1) + + slides = build_slide_list(slide_info, visible_images, temp_path) + + grid_files = create_grids(slides, cols, THUMBNAIL_WIDTH, output_path) + + print(f"Created {len(grid_files)} grid(s):") + for grid_file in grid_files: + print(f" {grid_file}") + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +def get_slide_info(pptx_path: Path) -> list[dict]: + with zipfile.ZipFile(pptx_path, "r") as zf: + rels_content = zf.read("ppt/_rels/presentation.xml.rels").decode("utf-8") + rels_dom = defusedxml.minidom.parseString(rels_content) + + rid_to_slide = {} + for rel in rels_dom.getElementsByTagName("Relationship"): + rid = rel.getAttribute("Id") + target = rel.getAttribute("Target") + rel_type = rel.getAttribute("Type") + if "slide" in rel_type and target.startswith("slides/"): + rid_to_slide[rid] = target.replace("slides/", "") + + pres_content = zf.read("ppt/presentation.xml").decode("utf-8") + pres_dom = defusedxml.minidom.parseString(pres_content) + + slides = [] + for sld_id in pres_dom.getElementsByTagName("p:sldId"): + rid = sld_id.getAttribute("r:id") + if rid in rid_to_slide: + hidden = sld_id.getAttribute("show") == "0" + slides.append({"name": rid_to_slide[rid], "hidden": hidden}) + + return slides + + +def build_slide_list( + slide_info: list[dict], + visible_images: list[Path], + temp_dir: Path, +) -> list[tuple[Path, str]]: + if visible_images: + with Image.open(visible_images[0]) as img: + placeholder_size = img.size + else: + placeholder_size = (1920, 1080) + + slides = [] + visible_idx = 0 + + for info in slide_info: + if info["hidden"]: + placeholder_path = temp_dir / f"hidden-{info['name']}.jpg" + placeholder_img = create_hidden_placeholder(placeholder_size) + placeholder_img.save(placeholder_path, "JPEG") + slides.append((placeholder_path, f"{info['name']} (hidden)")) + else: + if visible_idx < len(visible_images): + slides.append((visible_images[visible_idx], info["name"])) + visible_idx += 1 + + return slides + + +def create_hidden_placeholder(size: tuple[int, int]) -> Image.Image: + img = Image.new("RGB", size, color="#F0F0F0") + draw = ImageDraw.Draw(img) + line_width = max(5, min(size) // 100) + draw.line([(0, 0), size], fill="#CCCCCC", width=line_width) + draw.line([(size[0], 0), (0, size[1])], fill="#CCCCCC", width=line_width) + return img + + +def convert_to_images(pptx_path: Path, temp_dir: Path) -> list[Path]: + pdf_path = temp_dir / f"{pptx_path.stem}.pdf" + + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + "pdf", + "--outdir", + str(temp_dir), + str(pptx_path), + ], + capture_output=True, + text=True, + env=get_soffice_env(), + ) + if result.returncode != 0 or not pdf_path.exists(): + raise RuntimeError("PDF conversion failed") + + result = subprocess.run( + [ + "pdftoppm", + "-jpeg", + "-r", + str(CONVERSION_DPI), + str(pdf_path), + str(temp_dir / "slide"), + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError("Image conversion failed") + + return sorted(temp_dir.glob("slide-*.jpg")) + + +def create_grids( + slides: list[tuple[Path, str]], + cols: int, + width: int, + output_path: Path, +) -> list[str]: + max_per_grid = cols * (cols + 1) + grid_files = [] + + for chunk_idx, start_idx in enumerate(range(0, len(slides), max_per_grid)): + end_idx = min(start_idx + max_per_grid, len(slides)) + chunk_slides = slides[start_idx:end_idx] + + grid = create_grid(chunk_slides, cols, width) + + if len(slides) <= max_per_grid: + grid_filename = output_path + else: + stem = output_path.stem + suffix = output_path.suffix + grid_filename = output_path.parent / f"{stem}-{chunk_idx + 1}{suffix}" + + grid_filename.parent.mkdir(parents=True, exist_ok=True) + grid.save(str(grid_filename), quality=JPEG_QUALITY) + grid_files.append(str(grid_filename)) + + return grid_files + + +def create_grid( + slides: list[tuple[Path, str]], + cols: int, + width: int, +) -> Image.Image: + font_size = int(width * FONT_SIZE_RATIO) + label_padding = int(font_size * LABEL_PADDING_RATIO) + + with Image.open(slides[0][0]) as img: + aspect = img.height / img.width + height = int(width * aspect) + + rows = (len(slides) + cols - 1) // cols + grid_w = cols * width + (cols + 1) * GRID_PADDING + grid_h = rows * (height + font_size + label_padding * 2) + (rows + 1) * GRID_PADDING + + grid = Image.new("RGB", (grid_w, grid_h), "white") + draw = ImageDraw.Draw(grid) + + try: + font = ImageFont.load_default(size=font_size) + except Exception: + font = ImageFont.load_default() + + for i, (img_path, slide_name) in enumerate(slides): + row, col = i // cols, i % cols + x = col * width + (col + 1) * GRID_PADDING + y_base = ( + row * (height + font_size + label_padding * 2) + (row + 1) * GRID_PADDING + ) + + label = slide_name + bbox = draw.textbbox((0, 0), label, font=font) + text_w = bbox[2] - bbox[0] + draw.text( + (x + (width - text_w) // 2, y_base + label_padding), + label, + fill="black", + font=font, + ) + + y_thumbnail = y_base + label_padding + font_size + label_padding + + with Image.open(img_path) as img: + img.thumbnail((width, height), Image.Resampling.LANCZOS) + w, h = img.size + tx = x + (width - w) // 2 + ty = y_thumbnail + (height - h) // 2 + grid.paste(img, (tx, ty)) + + if BORDER_WIDTH > 0: + draw.rectangle( + [ + (tx - BORDER_WIDTH, ty - BORDER_WIDTH), + (tx + w + BORDER_WIDTH - 1, ty + h + BORDER_WIDTH - 1), + ], + outline="gray", + width=BORDER_WIDTH, + ) + + return grid + + +if __name__ == "__main__": + main() diff --git a/.github/skills/xlsx/LICENSE.txt b/.github/skills/xlsx/LICENSE.txt new file mode 100644 index 0000000..c55ab42 --- /dev/null +++ b/.github/skills/xlsx/LICENSE.txt @@ -0,0 +1,30 @@ +© 2025 Anthropic, PBC. All rights reserved. + +LICENSE: Use of these materials (including all code, prompts, assets, files, +and other components of this Skill) is governed by your agreement with +Anthropic regarding use of Anthropic's services. If no separate agreement +exists, use is governed by Anthropic's Consumer Terms of Service or +Commercial Terms of Service, as applicable: +https://www.anthropic.com/legal/consumer-terms +https://www.anthropic.com/legal/commercial-terms +Your applicable agreement is referred to as the "Agreement." "Services" are +as defined in the Agreement. + +ADDITIONAL RESTRICTIONS: Notwithstanding anything in the Agreement to the +contrary, users may not: + +- Extract these materials from the Services or retain copies of these + materials outside the Services +- Reproduce or copy these materials, except for temporary copies created + automatically during authorized use of the Services +- Create derivative works based on these materials +- Distribute, sublicense, or transfer these materials to any third party +- Make, offer to sell, sell, or import any inventions embodied in these + materials +- Reverse engineer, decompile, or disassemble these materials + +The receipt, viewing, or possession of these materials does not convey or +imply any license or right beyond those expressly granted above. + +Anthropic retains all right, title, and interest in these materials, +including all copyrights, patents, and other intellectual property rights. diff --git a/.github/skills/xlsx/SKILL.md b/.github/skills/xlsx/SKILL.md new file mode 100644 index 0000000..c5c881b --- /dev/null +++ b/.github/skills/xlsx/SKILL.md @@ -0,0 +1,292 @@ +--- +name: xlsx +description: "Use this skill any time a spreadsheet file is the primary input or output. This means any task where the user wants to: open, read, edit, or fix an existing .xlsx, .xlsm, .csv, or .tsv file (e.g., adding columns, computing formulas, formatting, charting, cleaning messy data); create a new spreadsheet from scratch or from other data sources; or convert between tabular file formats. Trigger especially when the user references a spreadsheet file by name or path — even casually (like \"the xlsx in my downloads\") — and wants something done to it or produced from it. Also trigger for cleaning or restructuring messy tabular data files (malformed rows, misplaced headers, junk data) into proper spreadsheets. The deliverable must be a spreadsheet file. Do NOT trigger when the primary deliverable is a Word document, HTML report, standalone Python script, database pipeline, or Google Sheets API integration, even if tabular data is involved." +license: Proprietary. LICENSE.txt has complete terms +--- + +# Requirements for Outputs + +## All Excel files + +### Professional Font +- Use a consistent, professional font (e.g., Arial, Times New Roman) for all deliverables unless otherwise instructed by the user + +### Zero Formula Errors +- Every Excel model MUST be delivered with ZERO formula errors (#REF!, #DIV/0!, #VALUE!, #N/A, #NAME?) + +### Preserve Existing Templates (when updating templates) +- Study and EXACTLY match existing format, style, and conventions when modifying files +- Never impose standardized formatting on files with established patterns +- Existing template conventions ALWAYS override these guidelines + +## Financial models + +### Color Coding Standards +Unless otherwise stated by the user or existing template + +#### Industry-Standard Color Conventions +- **Blue text (RGB: 0,0,255)**: Hardcoded inputs, and numbers users will change for scenarios +- **Black text (RGB: 0,0,0)**: ALL formulas and calculations +- **Green text (RGB: 0,128,0)**: Links pulling from other worksheets within same workbook +- **Red text (RGB: 255,0,0)**: External links to other files +- **Yellow background (RGB: 255,255,0)**: Key assumptions needing attention or cells that need to be updated + +### Number Formatting Standards + +#### Required Format Rules +- **Years**: Format as text strings (e.g., "2024" not "2,024") +- **Currency**: Use $#,##0 format; ALWAYS specify units in headers ("Revenue ($mm)") +- **Zeros**: Use number formatting to make all zeros "-", including percentages (e.g., "$#,##0;($#,##0);-") +- **Percentages**: Default to 0.0% format (one decimal) +- **Multiples**: Format as 0.0x for valuation multiples (EV/EBITDA, P/E) +- **Negative numbers**: Use parentheses (123) not minus -123 + +### Formula Construction Rules + +#### Assumptions Placement +- Place ALL assumptions (growth rates, margins, multiples, etc.) in separate assumption cells +- Use cell references instead of hardcoded values in formulas +- Example: Use =B5*(1+$B$6) instead of =B5*1.05 + +#### Formula Error Prevention +- Verify all cell references are correct +- Check for off-by-one errors in ranges +- Ensure consistent formulas across all projection periods +- Test with edge cases (zero values, negative numbers) +- Verify no unintended circular references + +#### Documentation Requirements for Hardcodes +- Comment or in cells beside (if end of table). Format: "Source: [System/Document], [Date], [Specific Reference], [URL if applicable]" +- Examples: + - "Source: Company 10-K, FY2024, Page 45, Revenue Note, [SEC EDGAR URL]" + - "Source: Company 10-Q, Q2 2025, Exhibit 99.1, [SEC EDGAR URL]" + - "Source: Bloomberg Terminal, 8/15/2025, AAPL US Equity" + - "Source: FactSet, 8/20/2025, Consensus Estimates Screen" + +# XLSX creation, editing, and analysis + +## Overview + +A user may ask you to create, edit, or analyze the contents of an .xlsx file. You have different tools and workflows available for different tasks. + +## Important Requirements + +**LibreOffice Required for Formula Recalculation**: You can assume LibreOffice is installed for recalculating formula values using the `scripts/recalc.py` script. The script automatically configures LibreOffice on first run, including in sandboxed environments where Unix sockets are restricted (handled by `scripts/office/soffice.py`) + +## Reading and analyzing data + +### Data analysis with pandas +For data analysis, visualization, and basic operations, use **pandas** which provides powerful data manipulation capabilities: + +```python +import pandas as pd + +# Read Excel +df = pd.read_excel('file.xlsx') # Default: first sheet +all_sheets = pd.read_excel('file.xlsx', sheet_name=None) # All sheets as dict + +# Analyze +df.head() # Preview data +df.info() # Column info +df.describe() # Statistics + +# Write Excel +df.to_excel('output.xlsx', index=False) +``` + +## Excel File Workflows + +## CRITICAL: Use Formulas, Not Hardcoded Values + +**Always use Excel formulas instead of calculating values in Python and hardcoding them.** This ensures the spreadsheet remains dynamic and updateable. + +### ❌ WRONG - Hardcoding Calculated Values +```python +# Bad: Calculating in Python and hardcoding result +total = df['Sales'].sum() +sheet['B10'] = total # Hardcodes 5000 + +# Bad: Computing growth rate in Python +growth = (df.iloc[-1]['Revenue'] - df.iloc[0]['Revenue']) / df.iloc[0]['Revenue'] +sheet['C5'] = growth # Hardcodes 0.15 + +# Bad: Python calculation for average +avg = sum(values) / len(values) +sheet['D20'] = avg # Hardcodes 42.5 +``` + +### ✅ CORRECT - Using Excel Formulas +```python +# Good: Let Excel calculate the sum +sheet['B10'] = '=SUM(B2:B9)' + +# Good: Growth rate as Excel formula +sheet['C5'] = '=(C4-C2)/C2' + +# Good: Average using Excel function +sheet['D20'] = '=AVERAGE(D2:D19)' +``` + +This applies to ALL calculations - totals, percentages, ratios, differences, etc. The spreadsheet should be able to recalculate when source data changes. + +## Common Workflow +1. **Choose tool**: pandas for data, openpyxl for formulas/formatting +2. **Create/Load**: Create new workbook or load existing file +3. **Modify**: Add/edit data, formulas, and formatting +4. **Save**: Write to file +5. **Recalculate formulas (MANDATORY IF USING FORMULAS)**: Use the scripts/recalc.py script + ```bash + python scripts/recalc.py output.xlsx + ``` +6. **Verify and fix any errors**: + - The script returns JSON with error details + - If `status` is `errors_found`, check `error_summary` for specific error types and locations + - Fix the identified errors and recalculate again + - Common errors to fix: + - `#REF!`: Invalid cell references + - `#DIV/0!`: Division by zero + - `#VALUE!`: Wrong data type in formula + - `#NAME?`: Unrecognized formula name + +### Creating new Excel files + +```python +# Using openpyxl for formulas and formatting +from openpyxl import Workbook +from openpyxl.styles import Font, PatternFill, Alignment + +wb = Workbook() +sheet = wb.active + +# Add data +sheet['A1'] = 'Hello' +sheet['B1'] = 'World' +sheet.append(['Row', 'of', 'data']) + +# Add formula +sheet['B2'] = '=SUM(A1:A10)' + +# Formatting +sheet['A1'].font = Font(bold=True, color='FF0000') +sheet['A1'].fill = PatternFill('solid', start_color='FFFF00') +sheet['A1'].alignment = Alignment(horizontal='center') + +# Column width +sheet.column_dimensions['A'].width = 20 + +wb.save('output.xlsx') +``` + +### Editing existing Excel files + +```python +# Using openpyxl to preserve formulas and formatting +from openpyxl import load_workbook + +# Load existing file +wb = load_workbook('existing.xlsx') +sheet = wb.active # or wb['SheetName'] for specific sheet + +# Working with multiple sheets +for sheet_name in wb.sheetnames: + sheet = wb[sheet_name] + print(f"Sheet: {sheet_name}") + +# Modify cells +sheet['A1'] = 'New Value' +sheet.insert_rows(2) # Insert row at position 2 +sheet.delete_cols(3) # Delete column 3 + +# Add new sheet +new_sheet = wb.create_sheet('NewSheet') +new_sheet['A1'] = 'Data' + +wb.save('modified.xlsx') +``` + +## Recalculating formulas + +Excel files created or modified by openpyxl contain formulas as strings but not calculated values. Use the provided `scripts/recalc.py` script to recalculate formulas: + +```bash +python scripts/recalc.py [timeout_seconds] +``` + +Example: +```bash +python scripts/recalc.py output.xlsx 30 +``` + +The script: +- Automatically sets up LibreOffice macro on first run +- Recalculates all formulas in all sheets +- Scans ALL cells for Excel errors (#REF!, #DIV/0!, etc.) +- Returns JSON with detailed error locations and counts +- Works on both Linux and macOS + +## Formula Verification Checklist + +Quick checks to ensure formulas work correctly: + +### Essential Verification +- [ ] **Test 2-3 sample references**: Verify they pull correct values before building full model +- [ ] **Column mapping**: Confirm Excel columns match (e.g., column 64 = BL, not BK) +- [ ] **Row offset**: Remember Excel rows are 1-indexed (DataFrame row 5 = Excel row 6) + +### Common Pitfalls +- [ ] **NaN handling**: Check for null values with `pd.notna()` +- [ ] **Far-right columns**: FY data often in columns 50+ +- [ ] **Multiple matches**: Search all occurrences, not just first +- [ ] **Division by zero**: Check denominators before using `/` in formulas (#DIV/0!) +- [ ] **Wrong references**: Verify all cell references point to intended cells (#REF!) +- [ ] **Cross-sheet references**: Use correct format (Sheet1!A1) for linking sheets + +### Formula Testing Strategy +- [ ] **Start small**: Test formulas on 2-3 cells before applying broadly +- [ ] **Verify dependencies**: Check all cells referenced in formulas exist +- [ ] **Test edge cases**: Include zero, negative, and very large values + +### Interpreting scripts/recalc.py Output +The script returns JSON with error details: +```json +{ + "status": "success", // or "errors_found" + "total_errors": 0, // Total error count + "total_formulas": 42, // Number of formulas in file + "error_summary": { // Only present if errors found + "#REF!": { + "count": 2, + "locations": ["Sheet1!B5", "Sheet1!C10"] + } + } +} +``` + +## Best Practices + +### Library Selection +- **pandas**: Best for data analysis, bulk operations, and simple data export +- **openpyxl**: Best for complex formatting, formulas, and Excel-specific features + +### Working with openpyxl +- Cell indices are 1-based (row=1, column=1 refers to cell A1) +- Use `data_only=True` to read calculated values: `load_workbook('file.xlsx', data_only=True)` +- **Warning**: If opened with `data_only=True` and saved, formulas are replaced with values and permanently lost +- For large files: Use `read_only=True` for reading or `write_only=True` for writing +- Formulas are preserved but not evaluated - use scripts/recalc.py to update values + +### Working with pandas +- Specify data types to avoid inference issues: `pd.read_excel('file.xlsx', dtype={'id': str})` +- For large files, read specific columns: `pd.read_excel('file.xlsx', usecols=['A', 'C', 'E'])` +- Handle dates properly: `pd.read_excel('file.xlsx', parse_dates=['date_column'])` + +## Code Style Guidelines +**IMPORTANT**: When generating Python code for Excel operations: +- Write minimal, concise Python code without unnecessary comments +- Avoid verbose variable names and redundant operations +- Avoid unnecessary print statements + +**For Excel files themselves**: +- Add comments to cells with complex formulas or important assumptions +- Document data sources for hardcoded values +- Include notes for key calculations and model sections \ No newline at end of file diff --git a/.github/skills/xlsx/scripts/office/helpers/__init__.py b/.github/skills/xlsx/scripts/office/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.github/skills/xlsx/scripts/office/helpers/merge_runs.py b/.github/skills/xlsx/scripts/office/helpers/merge_runs.py new file mode 100644 index 0000000..ad7c25e --- /dev/null +++ b/.github/skills/xlsx/scripts/office/helpers/merge_runs.py @@ -0,0 +1,199 @@ +"""Merge adjacent runs with identical formatting in DOCX. + +Merges adjacent elements that have identical properties. +Works on runs in paragraphs and inside tracked changes (, ). + +Also: +- Removes rsid attributes from runs (revision metadata that doesn't affect rendering) +- Removes proofErr elements (spell/grammar markers that block merging) +""" + +from pathlib import Path + +import defusedxml.minidom + + +def merge_runs(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + _remove_elements(root, "proofErr") + _strip_run_rsid_attrs(root) + + containers = {run.parentNode for run in _find_elements(root, "r")} + + merge_count = 0 + for container in containers: + merge_count += _merge_runs_in(container) + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Merged {merge_count} runs" + + except Exception as e: + return 0, f"Error: {e}" + + + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def _get_child(parent, tag: str): + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + return child + return None + + +def _get_children(parent, tag: str) -> list: + results = [] + for child in parent.childNodes: + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(child) + return results + + +def _is_adjacent(elem1, elem2) -> bool: + node = elem1.nextSibling + while node: + if node == elem2: + return True + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + return False + + + + +def _remove_elements(root, tag: str): + for elem in _find_elements(root, tag): + if elem.parentNode: + elem.parentNode.removeChild(elem) + + +def _strip_run_rsid_attrs(root): + for run in _find_elements(root, "r"): + for attr in list(run.attributes.values()): + if "rsid" in attr.name.lower(): + run.removeAttribute(attr.name) + + + + +def _merge_runs_in(container) -> int: + merge_count = 0 + run = _first_child_run(container) + + while run: + while True: + next_elem = _next_element_sibling(run) + if next_elem and _is_run(next_elem) and _can_merge(run, next_elem): + _merge_run_content(run, next_elem) + container.removeChild(next_elem) + merge_count += 1 + else: + break + + _consolidate_text(run) + run = _next_sibling_run(run) + + return merge_count + + +def _first_child_run(container): + for child in container.childNodes: + if child.nodeType == child.ELEMENT_NODE and _is_run(child): + return child + return None + + +def _next_element_sibling(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + return sibling + sibling = sibling.nextSibling + return None + + +def _next_sibling_run(node): + sibling = node.nextSibling + while sibling: + if sibling.nodeType == sibling.ELEMENT_NODE: + if _is_run(sibling): + return sibling + sibling = sibling.nextSibling + return None + + +def _is_run(node) -> bool: + name = node.localName or node.tagName + return name == "r" or name.endswith(":r") + + +def _can_merge(run1, run2) -> bool: + rpr1 = _get_child(run1, "rPr") + rpr2 = _get_child(run2, "rPr") + + if (rpr1 is None) != (rpr2 is None): + return False + if rpr1 is None: + return True + return rpr1.toxml() == rpr2.toxml() + + +def _merge_run_content(target, source): + for child in list(source.childNodes): + if child.nodeType == child.ELEMENT_NODE: + name = child.localName or child.tagName + if name != "rPr" and not name.endswith(":rPr"): + target.appendChild(child) + + +def _consolidate_text(run): + t_elements = _get_children(run, "t") + + for i in range(len(t_elements) - 1, 0, -1): + curr, prev = t_elements[i], t_elements[i - 1] + + if _is_adjacent(prev, curr): + prev_text = prev.firstChild.data if prev.firstChild else "" + curr_text = curr.firstChild.data if curr.firstChild else "" + merged = prev_text + curr_text + + if prev.firstChild: + prev.firstChild.data = merged + else: + prev.appendChild(run.ownerDocument.createTextNode(merged)) + + if merged.startswith(" ") or merged.endswith(" "): + prev.setAttribute("xml:space", "preserve") + elif prev.hasAttribute("xml:space"): + prev.removeAttribute("xml:space") + + run.removeChild(curr) diff --git a/.github/skills/xlsx/scripts/office/helpers/simplify_redlines.py b/.github/skills/xlsx/scripts/office/helpers/simplify_redlines.py new file mode 100644 index 0000000..db963bb --- /dev/null +++ b/.github/skills/xlsx/scripts/office/helpers/simplify_redlines.py @@ -0,0 +1,197 @@ +"""Simplify tracked changes by merging adjacent w:ins or w:del elements. + +Merges adjacent elements from the same author into a single element. +Same for elements. This makes heavily-redlined documents easier to +work with by reducing the number of tracked change wrappers. + +Rules: +- Only merges w:ins with w:ins, w:del with w:del (same element type) +- Only merges if same author (ignores timestamp differences) +- Only merges if truly adjacent (only whitespace between them) +""" + +import xml.etree.ElementTree as ET +import zipfile +from pathlib import Path + +import defusedxml.minidom + +WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def simplify_redlines(input_dir: str) -> tuple[int, str]: + doc_xml = Path(input_dir) / "word" / "document.xml" + + if not doc_xml.exists(): + return 0, f"Error: {doc_xml} not found" + + try: + dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8")) + root = dom.documentElement + + merge_count = 0 + + containers = _find_elements(root, "p") + _find_elements(root, "tc") + + for container in containers: + merge_count += _merge_tracked_changes_in(container, "ins") + merge_count += _merge_tracked_changes_in(container, "del") + + doc_xml.write_bytes(dom.toxml(encoding="UTF-8")) + return merge_count, f"Simplified {merge_count} tracked changes" + + except Exception as e: + return 0, f"Error: {e}" + + +def _merge_tracked_changes_in(container, tag: str) -> int: + merge_count = 0 + + tracked = [ + child + for child in container.childNodes + if child.nodeType == child.ELEMENT_NODE and _is_element(child, tag) + ] + + if len(tracked) < 2: + return 0 + + i = 0 + while i < len(tracked) - 1: + curr = tracked[i] + next_elem = tracked[i + 1] + + if _can_merge_tracked(curr, next_elem): + _merge_tracked_content(curr, next_elem) + container.removeChild(next_elem) + tracked.pop(i + 1) + merge_count += 1 + else: + i += 1 + + return merge_count + + +def _is_element(node, tag: str) -> bool: + name = node.localName or node.tagName + return name == tag or name.endswith(f":{tag}") + + +def _get_author(elem) -> str: + author = elem.getAttribute("w:author") + if not author: + for attr in elem.attributes.values(): + if attr.localName == "author" or attr.name.endswith(":author"): + return attr.value + return author + + +def _can_merge_tracked(elem1, elem2) -> bool: + if _get_author(elem1) != _get_author(elem2): + return False + + node = elem1.nextSibling + while node and node != elem2: + if node.nodeType == node.ELEMENT_NODE: + return False + if node.nodeType == node.TEXT_NODE and node.data.strip(): + return False + node = node.nextSibling + + return True + + +def _merge_tracked_content(target, source): + while source.firstChild: + child = source.firstChild + source.removeChild(child) + target.appendChild(child) + + +def _find_elements(root, tag: str) -> list: + results = [] + + def traverse(node): + if node.nodeType == node.ELEMENT_NODE: + name = node.localName or node.tagName + if name == tag or name.endswith(f":{tag}"): + results.append(node) + for child in node.childNodes: + traverse(child) + + traverse(root) + return results + + +def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]: + if not doc_xml_path.exists(): + return {} + + try: + tree = ET.parse(doc_xml_path) + root = tree.getroot() + except ET.ParseError: + return {} + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + + return authors + + +def _get_authors_from_docx(docx_path: Path) -> dict[str, int]: + try: + with zipfile.ZipFile(docx_path, "r") as zf: + if "word/document.xml" not in zf.namelist(): + return {} + with zf.open("word/document.xml") as f: + tree = ET.parse(f) + root = tree.getroot() + + namespaces = {"w": WORD_NS} + author_attr = f"{{{WORD_NS}}}author" + + authors: dict[str, int] = {} + for tag in ["ins", "del"]: + for elem in root.findall(f".//w:{tag}", namespaces): + author = elem.get(author_attr) + if author: + authors[author] = authors.get(author, 0) + 1 + return authors + except (zipfile.BadZipFile, ET.ParseError): + return {} + + +def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str: + modified_xml = modified_dir / "word" / "document.xml" + modified_authors = get_tracked_change_authors(modified_xml) + + if not modified_authors: + return default + + original_authors = _get_authors_from_docx(original_docx) + + new_changes: dict[str, int] = {} + for author, count in modified_authors.items(): + original_count = original_authors.get(author, 0) + diff = count - original_count + if diff > 0: + new_changes[author] = diff + + if not new_changes: + return default + + if len(new_changes) == 1: + return next(iter(new_changes)) + + raise ValueError( + f"Multiple authors added new changes: {new_changes}. " + "Cannot infer which author to validate." + ) diff --git a/.github/skills/xlsx/scripts/office/pack.py b/.github/skills/xlsx/scripts/office/pack.py new file mode 100644 index 0000000..db29ed8 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/pack.py @@ -0,0 +1,159 @@ +"""Pack a directory into a DOCX, PPTX, or XLSX file. + +Validates with auto-repair, condenses XML formatting, and creates the Office file. + +Usage: + python pack.py [--original ] [--validate true|false] + +Examples: + python pack.py unpacked/ output.docx --original input.docx + python pack.py unpacked/ output.pptx --validate false +""" + +import argparse +import sys +import shutil +import tempfile +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + +def pack( + input_directory: str, + output_file: str, + original_file: str | None = None, + validate: bool = True, + infer_author_func=None, +) -> tuple[None, str]: + input_dir = Path(input_directory) + output_path = Path(output_file) + suffix = output_path.suffix.lower() + + if not input_dir.is_dir(): + return None, f"Error: {input_dir} is not a directory" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {output_file} must be a .docx, .pptx, or .xlsx file" + + if validate and original_file: + original_path = Path(original_file) + if original_path.exists(): + success, output = _run_validation( + input_dir, original_path, suffix, infer_author_func + ) + if output: + print(output) + if not success: + return None, f"Error: Validation failed for {input_dir}" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + _condense_xml(xml_file) + + output_path.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + return None, f"Successfully packed {input_dir} to {output_file}" + + +def _run_validation( + unpacked_dir: Path, + original_file: Path, + suffix: str, + infer_author_func=None, +) -> tuple[bool, str | None]: + output_lines = [] + validators = [] + + if suffix == ".docx": + author = "Claude" + if infer_author_func: + try: + author = infer_author_func(unpacked_dir, original_file) + except ValueError as e: + print(f"Warning: {e} Using default author 'Claude'.", file=sys.stderr) + + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file), + RedliningValidator(unpacked_dir, original_file, author=author), + ] + elif suffix == ".pptx": + validators = [PPTXSchemaValidator(unpacked_dir, original_file)] + + if not validators: + return True, None + + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + output_lines.append(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + output_lines.append("All validations PASSED!") + + return success, "\n".join(output_lines) if output_lines else None + + +def _condense_xml(xml_file: Path) -> None: + try: + with open(xml_file, encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + for element in dom.getElementsByTagName("*"): + if element.tagName.endswith(":t"): + continue + + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + except Exception as e: + print(f"ERROR: Failed to parse {xml_file.name}: {e}", file=sys.stderr) + raise + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pack a directory into a DOCX, PPTX, or XLSX file" + ) + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument( + "--original", + help="Original file for validation comparison", + ) + parser.add_argument( + "--validate", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Run validation with auto-repair (default: true)", + ) + args = parser.parse_args() + + _, message = pack( + args.input_directory, + args.output_file, + original_file=args.original, + validate=args.validate, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 0000000..6454ef9 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 0000000..afa4f46 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 0000000..64e66b8 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 0000000..687eea8 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 0000000..6ac81b0 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 0000000..1dbf051 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..f1af17d --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..0a185ab --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 0000000..14ef488 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 0000000..c20f3bf --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 0000000..ac60252 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 0000000..424b8ba --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 0000000..2bddce2 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 0000000..8a8c18b --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 0000000..5c42706 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 0000000..853c341 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 0000000..da835ee --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 0000000..87ad265 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 0000000..9e86f1b --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 0000000..d0be42e --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 0000000..8821dd1 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 0000000..ca2575c --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 0000000..dd079e6 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 0000000..3dd6cf6 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 0000000..f1041e3 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 0000000..9c5b7a6 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 0000000..0f13678 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 0000000..a6de9d2 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 0000000..10e978b --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 0000000..4248bf7 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 0000000..5649746 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/mce/mc.xsd b/.github/skills/xlsx/scripts/office/schemas/mce/mc.xsd new file mode 100644 index 0000000..ef72545 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd new file mode 100644 index 0000000..f65f777 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd new file mode 100644 index 0000000..6b00755 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd new file mode 100644 index 0000000..f321d33 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 0000000..364c6a9 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 0000000..fed9d15 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 0000000..680cf15 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 0000000..89ada90 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/.github/skills/xlsx/scripts/office/soffice.py b/.github/skills/xlsx/scripts/office/soffice.py new file mode 100644 index 0000000..c7f7e32 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/soffice.py @@ -0,0 +1,183 @@ +""" +Helper for running LibreOffice (soffice) in environments where AF_UNIX +sockets may be blocked (e.g., sandboxed VMs). Detects the restriction +at runtime and applies an LD_PRELOAD shim if needed. + +Usage: + from office.soffice import run_soffice, get_soffice_env + + # Option 1 – run soffice directly + result = run_soffice(["--headless", "--convert-to", "pdf", "input.docx"]) + + # Option 2 – get env dict for your own subprocess calls + env = get_soffice_env() + subprocess.run(["soffice", ...], env=env) +""" + +import os +import socket +import subprocess +import tempfile +from pathlib import Path + + +def get_soffice_env() -> dict: + env = os.environ.copy() + env["SAL_USE_VCLPLUGIN"] = "svp" + + if _needs_shim(): + shim = _ensure_shim() + env["LD_PRELOAD"] = str(shim) + + return env + + +def run_soffice(args: list[str], **kwargs) -> subprocess.CompletedProcess: + env = get_soffice_env() + return subprocess.run(["soffice"] + args, env=env, **kwargs) + + + +_SHIM_SO = Path(tempfile.gettempdir()) / "lo_socket_shim.so" + + +def _needs_shim() -> bool: + try: + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.close() + return False + except OSError: + return True + + +def _ensure_shim() -> Path: + if _SHIM_SO.exists(): + return _SHIM_SO + + src = Path(tempfile.gettempdir()) / "lo_socket_shim.c" + src.write_text(_SHIM_SOURCE) + subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", str(_SHIM_SO), str(src), "-ldl"], + check=True, + capture_output=True, + ) + src.unlink() + return _SHIM_SO + + + +_SHIM_SOURCE = r""" +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +static int (*real_socket)(int, int, int); +static int (*real_socketpair)(int, int, int, int[2]); +static int (*real_listen)(int, int); +static int (*real_accept)(int, struct sockaddr *, socklen_t *); +static int (*real_close)(int); +static int (*real_read)(int, void *, size_t); + +/* Per-FD bookkeeping (FDs >= 1024 are passed through unshimmed). */ +static int is_shimmed[1024]; +static int peer_of[1024]; +static int wake_r[1024]; /* accept() blocks reading this */ +static int wake_w[1024]; /* close() writes to this */ +static int listener_fd = -1; /* FD that received listen() */ + +__attribute__((constructor)) +static void init(void) { + real_socket = dlsym(RTLD_NEXT, "socket"); + real_socketpair = dlsym(RTLD_NEXT, "socketpair"); + real_listen = dlsym(RTLD_NEXT, "listen"); + real_accept = dlsym(RTLD_NEXT, "accept"); + real_close = dlsym(RTLD_NEXT, "close"); + real_read = dlsym(RTLD_NEXT, "read"); + for (int i = 0; i < 1024; i++) { + peer_of[i] = -1; + wake_r[i] = -1; + wake_w[i] = -1; + } +} + +/* ---- socket ---------------------------------------------------------- */ +int socket(int domain, int type, int protocol) { + if (domain == AF_UNIX) { + int fd = real_socket(domain, type, protocol); + if (fd >= 0) return fd; + /* socket(AF_UNIX) blocked – fall back to socketpair(). */ + int sv[2]; + if (real_socketpair(domain, type, protocol, sv) == 0) { + if (sv[0] >= 0 && sv[0] < 1024) { + is_shimmed[sv[0]] = 1; + peer_of[sv[0]] = sv[1]; + int wp[2]; + if (pipe(wp) == 0) { + wake_r[sv[0]] = wp[0]; + wake_w[sv[0]] = wp[1]; + } + } + return sv[0]; + } + errno = EPERM; + return -1; + } + return real_socket(domain, type, protocol); +} + +/* ---- listen ---------------------------------------------------------- */ +int listen(int sockfd, int backlog) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + listener_fd = sockfd; + return 0; + } + return real_listen(sockfd, backlog); +} + +/* ---- accept ---------------------------------------------------------- */ +int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen) { + if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) { + /* Block until close() writes to the wake pipe. */ + if (wake_r[sockfd] >= 0) { + char buf; + real_read(wake_r[sockfd], &buf, 1); + } + errno = ECONNABORTED; + return -1; + } + return real_accept(sockfd, addr, addrlen); +} + +/* ---- close ----------------------------------------------------------- */ +int close(int fd) { + if (fd >= 0 && fd < 1024 && is_shimmed[fd]) { + int was_listener = (fd == listener_fd); + is_shimmed[fd] = 0; + + if (wake_w[fd] >= 0) { /* unblock accept() */ + char c = 0; + write(wake_w[fd], &c, 1); + real_close(wake_w[fd]); + wake_w[fd] = -1; + } + if (wake_r[fd] >= 0) { real_close(wake_r[fd]); wake_r[fd] = -1; } + if (peer_of[fd] >= 0) { real_close(peer_of[fd]); peer_of[fd] = -1; } + + if (was_listener) + _exit(0); /* conversion done – exit */ + } + return real_close(fd); +} +""" + + + +if __name__ == "__main__": + import sys + result = run_soffice(sys.argv[1:]) + sys.exit(result.returncode) diff --git a/.github/skills/xlsx/scripts/office/unpack.py b/.github/skills/xlsx/scripts/office/unpack.py new file mode 100644 index 0000000..0015253 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/unpack.py @@ -0,0 +1,132 @@ +"""Unpack Office files (DOCX, PPTX, XLSX) for editing. + +Extracts the ZIP archive, pretty-prints XML files, and optionally: +- Merges adjacent runs with identical formatting (DOCX only) +- Simplifies adjacent tracked changes from same author (DOCX only) + +Usage: + python unpack.py [options] + +Examples: + python unpack.py document.docx unpacked/ + python unpack.py presentation.pptx unpacked/ + python unpack.py document.docx unpacked/ --merge-runs false +""" + +import argparse +import sys +import zipfile +from pathlib import Path + +import defusedxml.minidom + +from helpers.merge_runs import merge_runs as do_merge_runs +from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines + +SMART_QUOTE_REPLACEMENTS = { + "\u201c": "“", + "\u201d": "”", + "\u2018": "‘", + "\u2019": "’", +} + + +def unpack( + input_file: str, + output_directory: str, + merge_runs: bool = True, + simplify_redlines: bool = True, +) -> tuple[None, str]: + input_path = Path(input_file) + output_path = Path(output_directory) + suffix = input_path.suffix.lower() + + if not input_path.exists(): + return None, f"Error: {input_file} does not exist" + + if suffix not in {".docx", ".pptx", ".xlsx"}: + return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file" + + try: + output_path.mkdir(parents=True, exist_ok=True) + + with zipfile.ZipFile(input_path, "r") as zf: + zf.extractall(output_path) + + xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) + for xml_file in xml_files: + _pretty_print_xml(xml_file) + + message = f"Unpacked {input_file} ({len(xml_files)} XML files)" + + if suffix == ".docx": + if simplify_redlines: + simplify_count, _ = do_simplify_redlines(str(output_path)) + message += f", simplified {simplify_count} tracked changes" + + if merge_runs: + merge_count, _ = do_merge_runs(str(output_path)) + message += f", merged {merge_count} runs" + + for xml_file in xml_files: + _escape_smart_quotes(xml_file) + + return None, message + + except zipfile.BadZipFile: + return None, f"Error: {input_file} is not a valid Office file" + except Exception as e: + return None, f"Error unpacking: {e}" + + +def _pretty_print_xml(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="utf-8")) + except Exception: + pass + + +def _escape_smart_quotes(xml_file: Path) -> None: + try: + content = xml_file.read_text(encoding="utf-8") + for char, entity in SMART_QUOTE_REPLACEMENTS.items(): + content = content.replace(char, entity) + xml_file.write_text(content, encoding="utf-8") + except Exception: + pass + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Unpack an Office file (DOCX, PPTX, XLSX) for editing" + ) + parser.add_argument("input_file", help="Office file to unpack") + parser.add_argument("output_directory", help="Output directory") + parser.add_argument( + "--merge-runs", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent runs with identical formatting (DOCX only, default: true)", + ) + parser.add_argument( + "--simplify-redlines", + type=lambda x: x.lower() == "true", + default=True, + metavar="true|false", + help="Merge adjacent tracked changes from same author (DOCX only, default: true)", + ) + args = parser.parse_args() + + _, message = unpack( + args.input_file, + args.output_directory, + merge_runs=args.merge_runs, + simplify_redlines=args.simplify_redlines, + ) + print(message) + + if "Error" in message: + sys.exit(1) diff --git a/.github/skills/xlsx/scripts/office/validate.py b/.github/skills/xlsx/scripts/office/validate.py new file mode 100644 index 0000000..03b01f6 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/validate.py @@ -0,0 +1,111 @@ +""" +Command line tool to validate Office document XML files against XSD schemas and tracked changes. + +Usage: + python validate.py [--original ] [--auto-repair] [--author NAME] + +The first argument can be either: +- An unpacked directory containing the Office document XML files +- A packed Office file (.docx/.pptx/.xlsx) which will be unpacked to a temp directory + +Auto-repair fixes: +- paraId/durableId values that exceed OOXML limits +- Missing xml:space="preserve" on w:t elements with whitespace +""" + +import argparse +import sys +import tempfile +import zipfile +from pathlib import Path + +from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Validate Office document XML files") + parser.add_argument( + "path", + help="Path to unpacked directory or packed Office file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "--original", + required=False, + default=None, + help="Path to original file (.docx/.pptx/.xlsx). If omitted, all XSD errors are reported and redlining validation is skipped.", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--auto-repair", + action="store_true", + help="Automatically repair common issues (hex IDs, whitespace preservation)", + ) + parser.add_argument( + "--author", + default="Claude", + help="Author name for redlining validation (default: Claude)", + ) + args = parser.parse_args() + + path = Path(args.path) + assert path.exists(), f"Error: {path} does not exist" + + original_file = None + if args.original: + original_file = Path(args.original) + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert original_file.suffix.lower() in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + file_extension = (original_file or path).suffix.lower() + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: Cannot determine file type from {path}. Use --original or provide a .docx/.pptx/.xlsx file." + ) + + if path.is_file() and path.suffix.lower() in [".docx", ".pptx", ".xlsx"]: + temp_dir = tempfile.mkdtemp() + with zipfile.ZipFile(path, "r") as zf: + zf.extractall(temp_dir) + unpacked_dir = Path(temp_dir) + else: + assert path.is_dir(), f"Error: {path} is not a directory or Office file" + unpacked_dir = path + + match file_extension: + case ".docx": + validators = [ + DOCXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + if original_file: + validators.append( + RedliningValidator(unpacked_dir, original_file, verbose=args.verbose, author=args.author) + ) + case ".pptx": + validators = [ + PPTXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose), + ] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + if args.auto_repair: + total_repairs = sum(v.repair() for v in validators) + if total_repairs: + print(f"Auto-repaired {total_repairs} issue(s)") + + success = all(v.validate() for v in validators) + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/.github/skills/xlsx/scripts/office/validators/__init__.py b/.github/skills/xlsx/scripts/office/validators/__init__.py new file mode 100644 index 0000000..db092ec --- /dev/null +++ b/.github/skills/xlsx/scripts/office/validators/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/.github/skills/xlsx/scripts/office/validators/base.py b/.github/skills/xlsx/scripts/office/validators/base.py new file mode 100644 index 0000000..db4a06a --- /dev/null +++ b/.github/skills/xlsx/scripts/office/validators/base.py @@ -0,0 +1,847 @@ +""" +Base validator with common validation logic for document files. +""" + +import re +from pathlib import Path + +import defusedxml.minidom +import lxml.etree + + +class BaseSchemaValidator: + + IGNORED_VALIDATION_ERRORS = [ + "hyphenationZone", + "purl.org/dc/terms", + ] + + UNIQUE_ID_REQUIREMENTS = { + "comment": ("id", "file"), + "commentrangestart": ("id", "file"), + "commentrangeend": ("id", "file"), + "bookmarkstart": ("id", "file"), + "bookmarkend": ("id", "file"), + "sldid": ("id", "file"), + "sldmasterid": ("id", "global"), + "sldlayoutid": ("id", "global"), + "cm": ("authorid", "file"), + "sheet": ("sheetid", "file"), + "definedname": ("id", "file"), + "cxnsp": ("id", "file"), + "sp": ("id", "file"), + "pic": ("id", "file"), + "grpsp": ("id", "file"), + } + + EXCLUDED_ID_CONTAINERS = { + "sectionlst", + } + + ELEMENT_RELATIONSHIP_TYPES = {} + + SCHEMA_MAPPINGS = { + "word": "ISO-IEC29500-4_2016/wml.xsd", + "ppt": "ISO-IEC29500-4_2016/pml.xsd", + "xl": "ISO-IEC29500-4_2016/sml.xsd", + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file=None, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) if original_file else None + self.verbose = verbose + + self.schemas_dir = Path(__file__).parent.parent / "schemas" + + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + raise NotImplementedError("Subclasses must implement the validate method") + + def repair(self) -> int: + return self.repair_whitespace_preservation() + + def repair_whitespace_preservation(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if elem.tagName.endswith(":t") and elem.firstChild: + text = elem.firstChild.nodeValue + if text and (text.startswith((' ', '\t')) or text.endswith((' ', '\t'))): + if elem.getAttribute("xml:space") != "preserve": + elem.setAttribute("xml:space", "preserve") + text_preview = repr(text[:30]) + "..." if len(text) > 30 else repr(text) + print(f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}") + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + def validate_xml(self): + errors = [] + + for xml_file in self.xml_files: + try: + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + errors = [] + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + errors = [] + global_ids = {} + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} + + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + for elem in root.iter(): + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + if tag in self.UNIQUE_ID_REQUIREMENTS: + in_excluded_container = any( + ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS + for ancestor in elem.iterancestors() + ) + if in_excluded_container: + continue + + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + if id_value in global_ids: + prev_file, prev_line, prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + errors = [] + + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): + all_files.append(file_path.resolve()) + + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + for rels_file in rels_files: + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + rels_dir = rels_file.parent + + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): + if target.startswith("/"): + target_path = self.unpacked_dir / target.lstrip("/") + elif rels_file.name == ".rels": + target_path = self.unpacked_dir / target + else: + base_dir = rels_dir.parent + target_path = base_dir / target + + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + import lxml.etree + + errors = [] + + for xml_file in self.xml_files: + if xml_file.suffix == ".rels": + continue + + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + if not rels_file.exists(): + continue + + try: + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE + rid_attrs_to_check = ["id", "embed", "link"] + for elem in xml_root.iter(): + for attr_name in rid_attrs_to_check: + rid_attr = elem.get(f"{{{r_ns}}}{attr_name}") + if not rid_attr: + continue + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})" + ) + elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + elem_lower = element_name.lower() + + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + if elem_lower.endswith("id") and len(elem_lower) > 2: + prefix = elem_lower[:-2] + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + if prefix == "sld": + return "slide" + return prefix.lower() + + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] + return prefix.lower() + + return None + + def validate_content_types(self): + errors = [] + + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", + "document", + "workbook", + "worksheet", + "theme", + } + + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue + + for file_path in all_files: + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + if extension in media_extensions: + relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() + elif is_valid: + return True, set() + + original_errors = self._get_original_file_errors(xml_file) + + assert current_errors is not None + new_errors = current_errors - original_errors + + new_errors = { + e for e in new_errors + if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS) + } + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + original_error_count += 1 + valid_count += 1 + continue + + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: + new_errors.append( + f" - {error[:250]}..." if len(error) > 250 else f" - {error}" + ) + + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + for attr in attrs_to_remove: + del elem.attrib[attr] + + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + elements_to_remove = [] + + for elem in list(root): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + self._remove_ignorable_elements(elem) + + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + root = xml_doc.getroot() + + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None + + try: + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + if self.original_file is None: + return set() + + import tempfile + import zipfile + + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + return set() + + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + for elem in xml_copy.iter(): + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/xlsx/scripts/office/validators/docx.py b/.github/skills/xlsx/scripts/office/validators/docx.py new file mode 100644 index 0000000..fec405e --- /dev/null +++ b/.github/skills/xlsx/scripts/office/validators/docx.py @@ -0,0 +1,446 @@ +""" +Validator for Word document XML files against XSD schemas. +""" + +import random +import re +import tempfile +import zipfile + +import defusedxml.minidom +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml" + W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid" + + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_whitespace_preservation(): + all_valid = False + + if not self.validate_deletions(): + all_valid = False + + if not self.validate_insertions(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_id_constraints(): + all_valid = False + + if not self.validate_comment_markers(): + all_valid = False + + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + if re.search(r"^[ \t\n\r]", text) or re.search( + r"[ \t\n\r]$", text + ): + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces): + if t_elem.text: + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + for instr_elem in root.xpath( + ".//w:del//w:instrText", namespaces=namespaces + ): + text_preview = ( + repr(instr_elem.text or "")[:50] + "..." + if len(repr(instr_elem.text or "")) > 50 + else repr(instr_elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {instr_elem.sourceline}: found within (use ): {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + count = 0 + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + original = self.original_file + if original is None: + return 0 + + count = 0 + + try: + with tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(original, "r") as zip_ref: + zip_ref.extractall(temp_dir) + + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." + if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + def _parse_id_value(self, val: str, base: int = 16) -> int: + return int(val, base) + + def validate_id_constraints(self): + errors = [] + para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId" + durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId" + + for xml_file in self.xml_files: + try: + for elem in lxml.etree.parse(str(xml_file)).iter(): + if val := elem.get(para_id_attr): + if self._parse_id_value(val, base=16) >= 0x80000000: + errors.append( + f" {xml_file.name}:{elem.sourceline}: paraId={val} >= 0x80000000" + ) + + if val := elem.get(durable_id_attr): + if xml_file.name == "numbering.xml": + try: + if self._parse_id_value(val, base=10) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except ValueError: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} must be decimal in numbering.xml" + ) + else: + if self._parse_id_value(val, base=16) >= 0x7FFFFFFF: + errors.append( + f" {xml_file.name}:{elem.sourceline}: " + f"durableId={val} >= 0x7FFFFFFF" + ) + except Exception: + pass + + if errors: + print(f"FAILED - {len(errors)} ID constraint violations:") + for e in errors: + print(e) + elif self.verbose: + print("PASSED - All paraId/durableId values within constraints") + return not errors + + def validate_comment_markers(self): + errors = [] + + document_xml = None + comments_xml = None + for xml_file in self.xml_files: + if xml_file.name == "document.xml" and "word" in str(xml_file): + document_xml = xml_file + elif xml_file.name == "comments.xml": + comments_xml = xml_file + + if not document_xml: + if self.verbose: + print("PASSED - No document.xml found (skipping comment validation)") + return True + + try: + doc_root = lxml.etree.parse(str(document_xml)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + range_starts = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeStart", namespaces=namespaces + ) + } + range_ends = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentRangeEnd", namespaces=namespaces + ) + } + references = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in doc_root.xpath( + ".//w:commentReference", namespaces=namespaces + ) + } + + orphaned_ends = range_ends - range_starts + for comment_id in sorted( + orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart' + ) + + orphaned_starts = range_starts - range_ends + for comment_id in sorted( + orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + errors.append( + f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd' + ) + + comment_ids = set() + if comments_xml and comments_xml.exists(): + comments_root = lxml.etree.parse(str(comments_xml)).getroot() + comment_ids = { + elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id") + for elem in comments_root.xpath( + ".//w:comment", namespaces=namespaces + ) + } + + marker_ids = range_starts | range_ends | references + invalid_refs = marker_ids - comment_ids + for comment_id in sorted( + invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0 + ): + if comment_id: + errors.append( + f' document.xml: marker id="{comment_id}" references non-existent comment' + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append(f" Error parsing XML: {e}") + + if errors: + print(f"FAILED - {len(errors)} comment marker violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All comment markers properly paired") + return True + + def repair(self) -> int: + repairs = super().repair() + repairs += self.repair_durableId() + return repairs + + def repair_durableId(self) -> int: + repairs = 0 + + for xml_file in self.xml_files: + try: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + modified = False + + for elem in dom.getElementsByTagName("*"): + if not elem.hasAttribute("w16cid:durableId"): + continue + + durable_id = elem.getAttribute("w16cid:durableId") + needs_repair = False + + if xml_file.name == "numbering.xml": + try: + needs_repair = ( + self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + else: + try: + needs_repair = ( + self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF + ) + except ValueError: + needs_repair = True + + if needs_repair: + value = random.randint(1, 0x7FFFFFFE) + if xml_file.name == "numbering.xml": + new_id = str(value) + else: + new_id = f"{value:08X}" + + elem.setAttribute("w16cid:durableId", new_id) + print( + f" Repaired: {xml_file.name}: durableId {durable_id} → {new_id}" + ) + repairs += 1 + modified = True + + if modified: + xml_file.write_bytes(dom.toxml(encoding="UTF-8")) + + except Exception: + pass + + return repairs + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/xlsx/scripts/office/validators/pptx.py b/.github/skills/xlsx/scripts/office/validators/pptx.py new file mode 100644 index 0000000..09842aa --- /dev/null +++ b/.github/skills/xlsx/scripts/office/validators/pptx.py @@ -0,0 +1,275 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. +""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + if not self.validate_xml(): + return False + + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + if not self.validate_unique_ids(): + all_valid = False + + if not self.validate_uuid_ids(): + all_valid = False + + if not self.validate_file_references(): + all_valid = False + + if not self.validate_slide_layout_ids(): + all_valid = False + + if not self.validate_content_types(): + all_valid = False + + if not self.validate_against_xsd(): + all_valid = False + + if not self.validate_notes_slide_references(): + all_valid = False + + if not self.validate_all_relationship_ids(): + all_valid = False + + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + import lxml.etree + + errors = [] + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + for elem in root.iter(): + for attr, value in elem.attrib.items(): + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + if self._looks_like_uuid(value): + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + clean_value = value.strip("{}()").replace("-", "") + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + import lxml.etree + + errors = [] + + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No slide masters found") + return True + + for slide_master in slide_masters: + try: + root = lxml.etree.parse(str(slide_master)).getroot() + + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." + ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + import lxml.etree + + errors = [] + notes_slide_references = {} + + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = rel.get("Target", "") + if target: + normalized_target = target.replace("../", "") + + slide_name = rels_file.stem.replace( + ".xml", "" + ) + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/xlsx/scripts/office/validators/redlining.py b/.github/skills/xlsx/scripts/office/validators/redlining.py new file mode 100644 index 0000000..71c81b6 --- /dev/null +++ b/.github/skills/xlsx/scripts/office/validators/redlining.py @@ -0,0 +1,247 @@ +""" +Validator for tracked changes in Word documents. +""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + + def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.author = author + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def repair(self) -> int: + return 0 + + def validate(self): + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + author_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + author_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == self.author + ] + + if not author_del_elements and not author_ins_elements: + if self.verbose: + print(f"PASSED - No tracked changes by {self.author} found.") + return True + + except Exception: + pass + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + self._remove_author_tracked_changes(original_root) + self._remove_author_tracked_changes(modified_root) + + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print(f"PASSED - All changes by {self.author} are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + error_parts = [ + f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + pass + + return None + + def _remove_author_tracked_changes(self, root): + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == self.author: + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == self.author: + to_process.append((child, list(parent).index(child))) + + for del_elem, del_index in reversed(to_process): + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/.github/skills/xlsx/scripts/recalc.py b/.github/skills/xlsx/scripts/recalc.py new file mode 100644 index 0000000..f472e9a --- /dev/null +++ b/.github/skills/xlsx/scripts/recalc.py @@ -0,0 +1,184 @@ +""" +Excel Formula Recalculation Script +Recalculates all formulas in an Excel file using LibreOffice +""" + +import json +import os +import platform +import subprocess +import sys +from pathlib import Path + +from office.soffice import get_soffice_env + +from openpyxl import load_workbook + +MACRO_DIR_MACOS = "~/Library/Application Support/LibreOffice/4/user/basic/Standard" +MACRO_DIR_LINUX = "~/.config/libreoffice/4/user/basic/Standard" +MACRO_FILENAME = "Module1.xba" + +RECALCULATE_MACRO = """ + + + Sub RecalculateAndSave() + ThisComponent.calculateAll() + ThisComponent.store() + ThisComponent.close(True) + End Sub +""" + + +def has_gtimeout(): + try: + subprocess.run( + ["gtimeout", "--version"], capture_output=True, timeout=1, check=False + ) + return True + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +def setup_libreoffice_macro(): + macro_dir = os.path.expanduser( + MACRO_DIR_MACOS if platform.system() == "Darwin" else MACRO_DIR_LINUX + ) + macro_file = os.path.join(macro_dir, MACRO_FILENAME) + + if ( + os.path.exists(macro_file) + and "RecalculateAndSave" in Path(macro_file).read_text() + ): + return True + + if not os.path.exists(macro_dir): + subprocess.run( + ["soffice", "--headless", "--terminate_after_init"], + capture_output=True, + timeout=10, + env=get_soffice_env(), + ) + os.makedirs(macro_dir, exist_ok=True) + + try: + Path(macro_file).write_text(RECALCULATE_MACRO) + return True + except Exception: + return False + + +def recalc(filename, timeout=30): + if not Path(filename).exists(): + return {"error": f"File {filename} does not exist"} + + abs_path = str(Path(filename).absolute()) + + if not setup_libreoffice_macro(): + return {"error": "Failed to setup LibreOffice macro"} + + cmd = [ + "soffice", + "--headless", + "--norestore", + "vnd.sun.star.script:Standard.Module1.RecalculateAndSave?language=Basic&location=application", + abs_path, + ] + + if platform.system() == "Linux": + cmd = ["timeout", str(timeout)] + cmd + elif platform.system() == "Darwin" and has_gtimeout(): + cmd = ["gtimeout", str(timeout)] + cmd + + result = subprocess.run(cmd, capture_output=True, text=True, env=get_soffice_env()) + + if result.returncode != 0 and result.returncode != 124: + error_msg = result.stderr or "Unknown error during recalculation" + if "Module1" in error_msg or "RecalculateAndSave" not in error_msg: + return {"error": "LibreOffice macro not configured properly"} + return {"error": error_msg} + + try: + wb = load_workbook(filename, data_only=True) + + excel_errors = [ + "#VALUE!", + "#DIV/0!", + "#REF!", + "#NAME?", + "#NULL!", + "#NUM!", + "#N/A", + ] + error_details = {err: [] for err in excel_errors} + total_errors = 0 + + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + for row in ws.iter_rows(): + for cell in row: + if cell.value is not None and isinstance(cell.value, str): + for err in excel_errors: + if err in cell.value: + location = f"{sheet_name}!{cell.coordinate}" + error_details[err].append(location) + total_errors += 1 + break + + wb.close() + + result = { + "status": "success" if total_errors == 0 else "errors_found", + "total_errors": total_errors, + "error_summary": {}, + } + + for err_type, locations in error_details.items(): + if locations: + result["error_summary"][err_type] = { + "count": len(locations), + "locations": locations[:20], + } + + wb_formulas = load_workbook(filename, data_only=False) + formula_count = 0 + for sheet_name in wb_formulas.sheetnames: + ws = wb_formulas[sheet_name] + for row in ws.iter_rows(): + for cell in row: + if ( + cell.value + and isinstance(cell.value, str) + and cell.value.startswith("=") + ): + formula_count += 1 + wb_formulas.close() + + result["total_formulas"] = formula_count + + return result + + except Exception as e: + return {"error": str(e)} + + +def main(): + if len(sys.argv) < 2: + print("Usage: python recalc.py [timeout_seconds]") + print("\nRecalculates all formulas in an Excel file using LibreOffice") + print("\nReturns JSON with error details:") + print(" - status: 'success' or 'errors_found'") + print(" - total_errors: Total number of Excel errors found") + print(" - total_formulas: Number of formulas in the file") + print(" - error_summary: Breakdown by error type with locations") + print(" - #VALUE!, #DIV/0!, #REF!, #NAME?, #NULL!, #NUM!, #N/A") + sys.exit(1) + + filename = sys.argv[1] + timeout = int(sys.argv[2]) if len(sys.argv) > 2 else 30 + + result = recalc(filename, timeout) + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + main() From 1522f3f3790931e6ceb3f46996aeb8fad7f0c8f7 Mon Sep 17 00:00:00 2001 From: emmanuelknafo <48259636+emmanuelknafo@users.noreply.github.com> Date: Thu, 12 Mar 2026 00:00:32 -0400 Subject: [PATCH 2/2] feat(workflows): enable automatic deployment on push to main Fixes AB#2020 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - uncomment push trigger on branches: [main] in deploy workflow - retain workflow_dispatch for manual runs 🚀 - Generated by Copilot --- .github/workflows/deploy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index d726a26..5a158a6 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -1,8 +1,8 @@ name: Build and Deploy to Azure on: - # push: - # branches: [main] + push: + branches: [main] workflow_dispatch: env: