-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDocxToMarkdown.py
More file actions
140 lines (107 loc) · 6.83 KB
/
DocxToMarkdown.py
File metadata and controls
140 lines (107 loc) · 6.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
########################################################################################################################################################
# Title: DocxToMarkdown.py
# Author(s): Dr. Gail Zhou & GitHub Copilot
# Created: April 2024
# Description: This script converts all .docx files in a directory to .md files.
# It also cleans the .md files by removing the lines that contain the patterns: ":
def _contains_path(parent, child):
    """Return True when *child* is *parent* itself or lies somewhere beneath it.

    Both paths are resolved (symlinks, relative segments) and case-normalised
    before comparison, so the check also works for paths that do not exist yet.
    """
    parent_real = os.path.normcase(os.path.realpath(parent))
    child_real = os.path.normcase(os.path.realpath(child))
    # rstrip keeps a filesystem root ("/" or "C:\\") from producing a doubled separator.
    return child_real == parent_real or child_real.startswith(parent_real.rstrip(os.sep) + os.sep)


# Script entry: convert every .docx under a user-supplied directory to Markdown
# with pandoc, then (optionally) write a cleaned copy of each Markdown file.
colorama_init()
try:
    # Get configuration settings
    load_dotenv()

    # CLEAN_MD controls the optional clean-up pass. Any case variant of
    # true / t / yes / y enables it; anything else (or unset) disables it.
    clean_md = os.getenv("CLEAN_MD")
    clean_md_flag = (clean_md or "").strip().lower() in {"true", "t", "y", "yes"}

    # Ask for the input directory path
    input_directory_path = input(f"{Fore.CYAN} Enter the input directory path for .docx files: {Fore.RESET}")
    print(f"Input directory: {input_directory_path}")

    # Ask for the output directory path of the raw (unfiltered) pandoc output
    raw_md_directory_path = input(f"{Fore.YELLOW} Enter the output directory path for RAW .md files: {Fore.RESET}")
    print(f"Output directory for raw MD files: {raw_md_directory_path}")
    # The output directory is wiped below; never let that wipe take the
    # source documents with it.
    if _contains_path(raw_md_directory_path, input_directory_path):
        raise ValueError(
            f"Refusing to delete {raw_md_directory_path}: it is, or contains, "
            f"the input directory {input_directory_path}."
        )
    if os.path.exists(raw_md_directory_path):
        print(f"{Fore.YELLOW}The {raw_md_directory_path} already exists. It is removed. New directory with same name is now created. {Fore.RESET}")
        shutil.rmtree(raw_md_directory_path)
    os.makedirs(raw_md_directory_path)

    # Ask for the output directory path of the cleaned files (only when cleaning is enabled)
    if clean_md_flag:
        clean_output_directory_path = input(f"{Fore.CYAN} Enter the output directory path for CLEAN .md files: {Fore.RESET}")
        print(f"Output directory for CLEAN .md files: {clean_output_directory_path}")
        if _contains_path(clean_output_directory_path, input_directory_path):
            raise ValueError(
                f"Refusing to delete {clean_output_directory_path}: it is, or contains, "
                f"the input directory {input_directory_path}."
            )
        if os.path.exists(clean_output_directory_path):
            print(f"{Fore.YELLOW}The {clean_output_directory_path} already exists. It is removed. New directory with same name is now created.{Fore.RESET}")
            shutil.rmtree(clean_output_directory_path)
        os.makedirs(clean_output_directory_path)

    print(f"{Fore.CYAN}\n\n ******************************* Converting .docx files into Markdown Files in RAW format ******************************\n\n {Fore.RESET}")

    # Recursively walk through all files in the input directory.
    # All outputs land flat in raw_md_directory_path, named after the input stem.
    for root, _dirs, files in os.walk(input_directory_path):
        for file_name in files:
            file_extension = Path(file_name).suffix
            if not file_name.endswith(('.docx', '.txt')):
                continue

            input_file_full_path = os.path.join(root, file_name)
            output_file_name = f"{Path(file_name).stem}.md"
            output_file_full_path = os.path.join(raw_md_directory_path, output_file_name)

            # os.walk can list entries that no longer resolve (e.g. a dangling link).
            if not os.path.exists(input_file_full_path):
                print(f"{Fore.RED}The {input_file_full_path} does not exist.{Fore.RESET}")
                continue

            # pandoc accepts many input formats (docx, html, latex, odt, rst, rtf, ...);
            # this script only feeds it .docx files.
            if file_extension == '.docx':
                print(f"{Fore.YELLOW}Processing {input_file_full_path}{Fore.RESET}")
                pypandoc.convert_file(input_file_full_path, 'md', outputfile=output_file_full_path)
                print(f"{Fore.CYAN} output: {output_file_full_path} {Fore.RESET}")
            elif file_extension == '.txt':
                # .txt input is not converted yet; only a blank line is emitted.
                print("")

    # Clean the markdown files if the clean_md flag is set to True
    if clean_md_flag:
        print(f"{Fore.CYAN}\n\n ******************************* Cleaning up Markdown Files *******************************\n\n{Fore.RESET}")
        # Recursively walk through all files in the raw Markdown directory
        for root, _dirs, files in os.walk(raw_md_directory_path):
            for file_name in files:
                if not file_name.endswith('.md'):
                    continue

                clean_md_file_name = f"{Path(file_name).stem}.md"
                input_file_full_path = os.path.join(root, file_name)
                output_file_full_path = os.path.join(clean_output_directory_path, clean_md_file_name)
                print(f"{Fore.CYAN} input: {input_file_full_path} {Fore.RESET}")

                # pandoc writes UTF-8, so read and write with an explicit
                # encoding instead of the platform default.
                with open(input_file_full_path, "r", encoding="utf-8") as f:
                    lines = f.readlines()

                with open(output_file_full_path, "w", encoding="utf-8") as f:
                    for line in lines:
                        # Drop every backslash from the line
                        newline = line.replace("\\", "")
                        # Skip the line if it contains one of the unwanted patterns
                        if not lineContainsPatterns(newline):
                            f.write(newline)

                print(f"{Fore.GREEN} output: {output_file_full_path} {Fore.RESET}")
                print("")

    print(f"{Fore.GREEN} ******************************* Done. *******************************\n {Fore.RESET}")
except Exception as ex:
    # Top-level boundary: report the failure message and end the script.
    print(ex)
def lineContainsPatterns(line):
patterns = ["