Skip to content

Commit 0b3342d

Browse files
committed
commit many fixes/update: broken state
1 parent 03e83b0 commit 0b3342d

25 files changed

Lines changed: 1823 additions & 950 deletions

api/text_pair.py

Lines changed: 185 additions & 132 deletions
Large diffs are not rendered by default.

extras/backup_database.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,11 @@
1515

1616

1717
def table_exists(user, password, table_name):
18-
conn = psycopg2.connect(database=GLOBAL_CONFIG.get("DATABASE", "database_name"), user=user, password=password)
18+
conn = psycopg2.connect(
19+
database=GLOBAL_CONFIG.get("DATABASE", "database_name"),
20+
user=user,
21+
password=password,
22+
)
1923
with conn.cursor() as cursor:
2024
cursor.execute("SELECT 1 FROM information_schema.tables WHERE table_name=%s", (table_name,))
2125
result = cursor.fetchone()
@@ -31,13 +35,13 @@ def back_up_philo_db_data(philo_db_path, output_path):
3135
text_path = output_path / "TEXT"
3236
text_path.mkdir()
3337
print(" - Copying TEXT files...")
34-
for file in os.scandir(philo_db_path / 'data/TEXT/'):
38+
for file in os.scandir(philo_db_path / "data/TEXT/"):
3539
shutil.copy(file.path, text_path)
3640

3741
# Copy db related files:
3842
print(" - Copying database files...")
39-
shutil.copy(philo_db_path / 'data/toms.db', output_path)
40-
shutil.copy(philo_db_path / 'data/db.locals.py', output_path)
43+
shutil.copy(philo_db_path / "data/toms.db", output_path)
44+
shutil.copy(philo_db_path / "data/db.locals.py", output_path)
4145
print(" ✓ PhiloLogic database backup complete")
4246

4347

@@ -112,12 +116,12 @@ def extract_textpair_database(table, web_app_path, output_path):
112116

113117
print(" - Compressing with LZ4...")
114118
# Read the tar file and compress with lz4
115-
with open(temp_tar, 'rb') as f:
119+
with open(temp_tar, "rb") as f:
116120
tar_data = f.read()
117121
compressed_data = lz4.frame.compress(tar_data, compression_level=3)
118122

119123
# Write the compressed data
120-
with open(tar_path, 'wb') as f:
124+
with open(tar_path, "wb") as f:
121125
f.write(compressed_data)
122126

123127
# Clean up temporary tar file
@@ -144,4 +148,4 @@ def extract_textpair_database(table, web_app_path, output_path):
144148
output_path = args.output_path or os.getcwd()
145149
web_app_path = args.web_app_path or os.path.join(GLOBAL_CONFIG.get("WEB_APP", "web_app_path"), args.db_name)
146150
web_app_path = web_app_path.rstrip("/")
147-
extract_textpair_database(args.db_name, web_app_path, output_path)
151+
extract_textpair_database(args.db_name, web_app_path, output_path)

extras/restore_database.py

Lines changed: 32 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ def check_database_connection(user, password):
2121
conn = psycopg2.connect(
2222
database=GLOBAL_CONFIG.get("DATABASE", "database_name"),
2323
user=user,
24-
password=password
24+
password=password,
2525
)
2626
conn.close()
2727
return True
@@ -48,28 +48,28 @@ def update_app_config(web_app_path):
4848

4949
# Update the apiServer value
5050
api_server = GLOBAL_CONFIG.get("WEB_APP", "api_server")
51-
config['apiServer'] = api_server
51+
config["apiServer"] = api_server
5252

5353
# Update PhiloLogic paths to point to the restored data
5454
source_data_path = web_app_path / "source_data"
5555
if source_data_path.exists():
56-
config['sourcePhiloDBPath'] = str(source_data_path.absolute())
56+
config["sourcePhiloDBPath"] = str(source_data_path.absolute())
5757

5858
target_data_path = web_app_path / "target_data"
5959
if target_data_path.exists():
60-
config['targetPhiloDBPath'] = str(target_data_path.absolute())
61-
elif 'targetPhiloDBPath' in config:
60+
config["targetPhiloDBPath"] = str(target_data_path.absolute())
61+
elif "targetPhiloDBPath" in config:
6262
# If target_data doesn't exist and there was a target path, remove it
63-
config['targetPhiloDBPath'] = ""
63+
config["targetPhiloDBPath"] = ""
6464

6565
# Write the updated config back
66-
with open(config_path, 'w') as f:
66+
with open(config_path, "w") as f:
6767
json.dump(config, f, indent=2)
6868

6969
print(f"Updated appConfig.json:")
7070
print(f" - apiServer: {api_server}")
7171
print(f" - sourcePhiloDBPath: {config['sourcePhiloDBPath']}")
72-
if config.get('targetPhiloDBPath'):
72+
if config.get("targetPhiloDBPath"):
7373
print(f" - targetPhiloDBPath: {config['targetPhiloDBPath']}")
7474

7575
return True
@@ -91,11 +91,11 @@ def run_npm_build(web_app_path):
9191

9292
# Run npm install
9393
print("Running npm install...")
94-
subprocess.run(['npm', 'install'], check=True)
94+
subprocess.run(["npm", "install"], check=True)
9595

9696
# Run npm build
9797
print("Running npm run build...")
98-
subprocess.run(['npm', 'run', 'build'], check=True)
98+
subprocess.run(["npm", "run", "build"], check=True)
9999

100100
return True
101101

@@ -117,12 +117,12 @@ def check_existing_resources(db_name, db_user, db_password, web_app_dest, backup
117117
# Check for existing tables
118118
sql_files = list(backup_dir.glob("textpair_*.sql"))
119119
for sql_file in sql_files:
120-
table_name = sql_file.stem.replace('textpair_', '')
120+
table_name = sql_file.stem.replace("textpair_", "")
121121
with psycopg2.connect(database=db_name, user=db_user, password=db_password) as conn:
122122
with conn.cursor() as cursor:
123123
cursor.execute(
124124
"SELECT 1 FROM information_schema.tables WHERE table_name = %s",
125-
(table_name,)
125+
(table_name,),
126126
)
127127
if cursor.fetchone() is not None:
128128
existing_resources.append(f"database table '{table_name}'")
@@ -176,12 +176,12 @@ def restore_textpair_database(backup_path, web_app_dest=None, force=False):
176176
# Extract the tarball using lz4 module
177177
print("\nExtracting backup archive...")
178178
print(" - Decompressing with LZ4...")
179-
with open(backup_path, 'rb') as f:
179+
with open(backup_path, "rb") as f:
180180
compressed_data = f.read()
181181
decompressed_data = lz4.frame.decompress(compressed_data)
182182
print(" - Extracting files...")
183183
temp_tar = temp_dir / "temp.tar"
184-
with open(temp_tar, 'wb') as f:
184+
with open(temp_tar, "wb") as f:
185185
f.write(decompressed_data)
186186
os.system(f"tar xf {temp_tar} -C {temp_dir}")
187187
os.remove(temp_tar)
@@ -209,8 +209,10 @@ def restore_textpair_database(backup_path, web_app_dest=None, force=False):
209209
print("\nWARNING: The following resources will be overwritten:")
210210
for resource in existing:
211211
print(f" - {resource}")
212-
response = input("\nDo you want to proceed with the restoration? This will replace all existing resources (y/n): ")
213-
if response.lower() != 'y':
212+
response = input(
213+
"\nDo you want to proceed with the restoration? This will replace all existing resources (y/n): "
214+
)
215+
if response.lower() != "y":
214216
print("Restoration cancelled")
215217
return
216218
print("") # Empty line for better readability
@@ -224,7 +226,7 @@ def restore_textpair_database(backup_path, web_app_dest=None, force=False):
224226
print(f"Found {len(sql_files)} tables to restore")
225227

226228
for sql_file in sql_files:
227-
table_name = sql_file.stem.replace('textpair_', '')
229+
table_name = sql_file.stem.replace("textpair_", "")
228230

229231
# Drop existing table if it exists
230232
print(f" - Processing {table_name}:")
@@ -236,7 +238,7 @@ def restore_textpair_database(backup_path, web_app_dest=None, force=False):
236238

237239
# Restore table
238240
print(f" • Restoring table data...")
239-
os.system(f'PGPASSWORD={db_password} psql -U {db_user} -d {db_name} -f {sql_file}')
241+
os.system(f"PGPASSWORD={db_password} psql -U {db_user} -d {db_name} -f {sql_file}")
240242
print(f" ✓ Table {table_name} restored")
241243

242244
print("✓ Database restoration complete")
@@ -290,10 +292,17 @@ def restore_textpair_database(backup_path, web_app_dest=None, force=False):
290292
if __name__ == "__main__":
291293
parser = ArgumentParser()
292294
parser.add_argument("backup_path", type=str, help="Path to the backup tarball file")
293-
parser.add_argument("--web_app_dest", type=str, default="",
294-
help="Optional destination path for web app files")
295-
parser.add_argument("--force", action="store_true",
296-
help="Overwrite existing files/tables without prompting")
295+
parser.add_argument(
296+
"--web_app_dest",
297+
type=str,
298+
default="",
299+
help="Optional destination path for web app files",
300+
)
301+
parser.add_argument(
302+
"--force",
303+
action="store_true",
304+
help="Overwrite existing files/tables without prompting",
305+
)
297306
args = parser.parse_args()
298307

299-
restore_textpair_database(args.backup_path, args.web_app_dest, args.force)
308+
restore_textpair_database(args.backup_path, args.web_app_dest, args.force)

lib/textpair/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,13 @@
11
"""Global imports for main textpair function"""
2+
23
from .parse_config import get_config
34
from .passage_classifier import classify_passages
4-
from .sequence_alignment import Ngrams, banality_auto_detect, merge_alignments, phrase_matcher
5+
from .sequence_alignment import (
6+
Ngrams,
7+
banality_auto_detect,
8+
merge_alignments,
9+
phrase_matcher,
10+
)
511
from .text_parser import parse_files
612
from .utils import get_text
713
from .vector_space_alignment import run_vsa

lib/textpair/__main__.py

Lines changed: 51 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -34,14 +34,25 @@ def build_graph_and_labels(alignments_file: str, embedding_model: str, llm_param
3434
# Run graph building in separate environment
3535
try:
3636
result = subprocess.run(
37-
[graph_python, "-m", "textpair_graph", "build",
38-
alignments_file, output_dir, "--model", embedding_model],
37+
[
38+
graph_python,
39+
"-m",
40+
"textpair_graph",
41+
"build",
42+
alignments_file,
43+
output_dir,
44+
"--model",
45+
embedding_model,
46+
],
3947
check=True,
40-
capture_output=False
48+
capture_output=False,
4149
)
4250
print("✓ Graph model built successfully!")
4351
except subprocess.CalledProcessError as e:
44-
print(f"ERROR: Graph model generation failed with exit code {e.returncode}", file=sys.stderr)
52+
print(
53+
f"ERROR: Graph model generation failed with exit code {e.returncode}",
54+
file=sys.stderr,
55+
)
4556
return
4657
except FileNotFoundError:
4758
print(f"ERROR: Graph environment not found at {graph_python}", file=sys.stderr)
@@ -55,15 +66,27 @@ def build_graph_and_labels(alignments_file: str, embedding_model: str, llm_param
5566
if os.path.exists(graph_data_path):
5667
try:
5768
result = subprocess.run(
58-
[graph_python, "-m", "textpair_graph", "label",
59-
graph_data_path, "--model", llm_params["llm_model"],
60-
"--context_window", str(llm_params["llm_context_window"]),
61-
"--port", str(llm_params["llm_port"])],
69+
[
70+
graph_python,
71+
"-m",
72+
"textpair_graph",
73+
"label",
74+
graph_data_path,
75+
"--model",
76+
llm_params["llm_model"],
77+
"--context_window",
78+
str(llm_params["llm_context_window"]),
79+
"--port",
80+
str(llm_params["llm_port"]),
81+
],
6282
check=True,
63-
capture_output=False
83+
capture_output=False,
6484
)
6585
except subprocess.CalledProcessError as e:
66-
print(f"WARNING: Cluster labeling failed with exit code {e.returncode}", file=sys.stderr)
86+
print(
87+
f"WARNING: Cluster labeling failed with exit code {e.returncode}",
88+
file=sys.stderr,
89+
)
6790

6891

6992
def delete_database(dbname: str) -> None:
@@ -200,14 +223,24 @@ async def run_alignment(params):
200223
filename = os.listdir(result_batch_path)[0]
201224
os.system(f"mv {result_batch_path}/{filename} {results_file} && rm -rf {result_batch_path}")
202225
else:
203-
print("Merging alignments into one file (this may take a while)... ", end="", flush=True)
226+
print(
227+
"Merging alignments into one file (this may take a while)... ",
228+
end="",
229+
flush=True,
230+
)
204231
merge_command = f"find {result_batch_path} -type f | sort -V | xargs lz4cat --rm | lz4 -q > {results_file}; rm -rf {result_batch_path}"
205232
os.system(merge_command)
206233
print("done.")
207234
count = get_count(os.path.join(params.output_path, "results/count.txt"))
208235

209236
# Postprocessing steps
210-
if any([params.matching_params["phrase_filter"], params.matching_params["banality_auto_detection"], params.matching_params["banality_llm_eval"]]):
237+
if any(
238+
[
239+
params.matching_params["phrase_filter"],
240+
params.matching_params["banality_auto_detection"],
241+
params.matching_params["banality_llm_eval"],
242+
]
243+
):
211244
print(f"\n### Postprocessing {count} pairwise alignments ###")
212245
if params.matching_params["phrase_filter"]:
213246
filtered_passages = phrase_matcher(results_file, params.matching_params["phrase_filter"], count)
@@ -219,7 +252,7 @@ async def run_alignment(params):
219252
banalities_found = banality_auto_detect(
220253
results_file,
221254
params.paths["source"]["common_ngrams"],
222-
f'{params.paths["source"]["ngram_output_path"]}/ngrams_in_order',
255+
f"{params.paths['source']['ngram_output_path']}/ngrams_in_order",
223256
params.matching_params["store_banalities"],
224257
count,
225258
params.matching_params["most_common_ngram_proportion"],
@@ -261,7 +294,7 @@ async def run_alignment(params):
261294
params.passage_classification["classes"],
262295
min_confidence=0.3,
263296
top_k=3,
264-
batch_size=32
297+
batch_size=32,
265298
)
266299

267300
# Passage merger
@@ -293,7 +326,6 @@ async def run_alignment(params):
293326
)
294327

295328

296-
297329
async def run_vsa_similarity(params) -> None:
298330
"""Run vsa similarity"""
299331
if params.paths["target"]["ngram_output_path"] == "": # if path not defined make target like source
@@ -331,7 +363,7 @@ async def run_vsa_similarity(params) -> None:
331363
params.workers,
332364
{**params.preprocessing_params, **params.matching_params},
333365
params.output_path,
334-
params.llm_params
366+
params.llm_params,
335367
)
336368

337369
# Passage classification (if enabled)
@@ -344,7 +376,7 @@ async def run_vsa_similarity(params) -> None:
344376
params.passage_classification["classes"],
345377
min_confidence=0.5,
346378
top_k=3,
347-
batch_size=32
379+
batch_size=32,
348380
)
349381

350382
if params.web_app_config["skip_web_app"] is False:
@@ -368,7 +400,7 @@ async def run_vsa_similarity(params) -> None:
368400
params.web_app_config["source_philo_db_path"],
369401
params.web_app_config["target_philo_db_path"],
370402
params.matching_params["matching_algorithm"],
371-
params
403+
params,
372404
)
373405

374406

@@ -432,4 +464,5 @@ async def main():
432464

433465
if __name__ == "__main__":
434466
import asyncio
467+
435468
asyncio.run(main())

0 commit comments

Comments
 (0)