-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbibtex_processor.py
More file actions
128 lines (101 loc) · 5.53 KB
/
bibtex_processor.py
File metadata and controls
128 lines (101 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import asyncio
from typing import Optional
from pathlib import Path
import time
from datetime import datetime
import logging
from metadata_enhancer import BibTexEnhancer
from doi_converter import DOIConverter
class BibTexProcessor:
    """Combined service to enhance and convert BibTeX entries.

    Wraps a ``BibTexEnhancer`` and a ``DOIConverter`` and runs them back to
    back on one input file, writing a combined summary to a dedicated log
    file under ``log_dir``.
    """

    # Attribute set on the file handlers installed by this class so that a
    # later instance can recognise and replace them instead of stacking
    # duplicates on the shared, process-wide 'combined_logger'.
    _HANDLER_MARK = "_bibtex_processor_handler"

    def __init__(self,
                 crossref_api_key: Optional[str] = None,
                 semantic_scholar_api_key: Optional[str] = None,
                 acm_api_key: Optional[str] = None,
                 log_dir: str = "logs",
                 output_dir: str = "bibtex_output"):
        """Build the enhancer and converter and configure combined logging.

        Args:
            crossref_api_key: Forwarded to ``BibTexEnhancer``.
            semantic_scholar_api_key: Forwarded to ``BibTexEnhancer``.
            acm_api_key: Forwarded to ``BibTexEnhancer``.
            log_dir: Directory for log files; forwarded to both components
                and used for the combined log.
            output_dir: Forwarded to both components.
        """
        self.enhancer = BibTexEnhancer(
            crossref_api_key=crossref_api_key,
            semantic_scholar_api_key=semantic_scholar_api_key,
            acm_api_key=acm_api_key,
            log_dir=log_dir,
            output_dir=output_dir
        )
        self.converter = DOIConverter(
            log_dir=log_dir,
            output_dir=output_dir
        )
        self.setup_logging(log_dir)

    def setup_logging(self, log_dir: str):
        """Setup combined logging.

        Creates ``log_dir`` (including missing parents) and attaches a
        timestamped file handler to the ``'combined_logger'`` logger.

        ``logging.getLogger`` returns the same logger object for the same
        name, so handlers attached by earlier calls are still present. Those
        handlers are detached and closed first; otherwise every message
        would be written once per processor ever created and the old file
        handles would stay open. Handlers attached by other code are left
        untouched.
        """
        log_path = Path(log_dir)
        log_path.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Setup combined logger
        self.combined_logger = logging.getLogger('combined_logger')
        self.combined_logger.setLevel(logging.INFO)

        # Iterate over a copy: removeHandler mutates logger.handlers.
        for stale in list(self.combined_logger.handlers):
            if getattr(stale, self._HANDLER_MARK, False):
                self.combined_logger.removeHandler(stale)
                stale.close()

        combined_handler = logging.FileHandler(
            str(log_path / f"combined_process_{timestamp}.log")
        )
        combined_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
        setattr(combined_handler, self._HANDLER_MARK, True)
        self.combined_logger.addHandler(combined_handler)

    @staticmethod
    def _percentage(successes, total) -> float:
        """Return ``successes / total`` as a percentage, or 0.0 if ``total`` is not positive."""
        if total > 0:
            return successes / total * 100
        return 0.0

    async def process_bibtex_file(self, file_path: str, is_async: bool = False) -> str:
        """Process BibTeX file through both enhancement and DOI conversion.

        Args:
            file_path: Path of the input BibTeX file.
            is_async: Passed through unchanged to the enhancer and converter.

        Returns:
            Whatever ``DOIConverter.convert_bibtex_file`` returns for the
            enhanced file (logged as the final output file).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            Exception: Any error raised by the enhancer or converter is
                logged with its traceback and re-raised unchanged.
        """
        log = self.combined_logger
        # perf_counter is monotonic, so the reported duration cannot be
        # distorted by system clock adjustments during a long run.
        started = time.perf_counter()

        try:
            log.info(f"Starting processing of file: {file_path}")

            # Check if file exists
            if not Path(file_path).exists():
                raise FileNotFoundError(f"Input file not found: {file_path}")

            log.info("Starting metadata enhancement process...")
            enhanced_file = await self.enhancer.enhance_bibtex_file(file_path, is_async)
            enhancement_rate = self._percentage(
                self.enhancer.successful_matches, self.enhancer.total_entries
            )

            log.info("Starting DOI conversion process...")
            final_file = await self.converter.convert_bibtex_file(enhanced_file, is_async)
            conversion_rate = self._percentage(
                self.converter.successful_conversions, self.converter.total_entries
            )

            total_time = time.perf_counter() - started

            # Log combined statistics
            log.info("\n=== Combined Processing Statistics ===")
            log.info(f"Total processing time: {total_time:.2f}s")
            log.info(f"Total entries processed: {self.enhancer.total_entries}")
            log.info(f"Enhanced entries: {self.enhancer.successful_matches} ({enhancement_rate:.1f}%)")
            log.info(f"Converted DOIs: {self.converter.successful_conversions} ({conversion_rate:.1f}%)")
            log.info(f"Input file: {file_path}")
            log.info(f"Intermediate file: {enhanced_file}")
            log.info(f"Final output file: {final_file}")

            # Log provider-specific success rates
            log.info("\n=== Provider Success Rates ===")
            provider_times = self.enhancer.provider_times
            for provider_name, count in self.enhancer.provider_stats.items():
                # .get: a provider without recorded timings must not turn a
                # finished run into a KeyError while only writing the summary.
                # NOTE(review): assumes provider_times is a dict-like mapping.
                attempts = len(provider_times.get(provider_name, ()))
                if attempts > 0:
                    success_rate = (count / attempts) * 100
                    log.info(f"{provider_name}: {success_rate:.1f}% ({count}/{attempts})")

            return final_file

        except Exception as e:
            # .exception() logs at ERROR level like .error(), plus the traceback.
            log.exception(f"Error during processing: {str(e)}")
            raise

    def calculate_overall_success_rate(self) -> float:
        """Calculate the overall success rate of the entire process.

        The larger of the enhancer's and the converter's success counts is
        divided by the enhancer's entry count. The two stages keep separate
        counters, so the result is capped at 100.0 in case the converter
        reports more successes than the enhancer saw entries.

        Returns:
            A percentage in the range 0.0-100.0; 0.0 when no entries were
            processed.
        """
        # Consider success if either enhancement or DOI conversion worked
        total_successes = max(self.enhancer.successful_matches,
                              self.converter.successful_conversions)
        return min(self._percentage(total_successes, self.enhancer.total_entries), 100.0)
# Example usage
if __name__ == "__main__":
    processor = BibTexProcessor()

    async def main(async_mode: bool = False) -> int:
        """Run the processor on 'thesis.bib' and print a short summary.

        Args:
            async_mode: Passed to ``process_bibtex_file`` as ``is_async``.

        Returns:
            0 on success, 1 if processing raised an exception (the error
            message is printed instead of propagating).
        """
        try:
            # Monotonic clock: immune to wall-clock adjustments mid-run.
            started = time.perf_counter()
            final_bibtex = await processor.process_bibtex_file('thesis.bib', is_async=async_mode)
            elapsed = time.perf_counter() - started

            print("\nProcessing completed:")
            print(f"{'Async' if async_mode else 'Sync'} mode time: {elapsed:.2f} seconds")
            print(f"Overall success rate: {processor.calculate_overall_success_rate():.1f}%")
            print(f"Final output written to: {final_bibtex}")
        except Exception as e:
            print(f"Error: {str(e)}")
            return 1
        return 0

    exit_code = asyncio.run(main(async_mode=True))
    # The bare exit() helper is injected by the `site` module and is absent
    # under `python -S` or in frozen builds; raising SystemExit always works
    # and sets the same process exit status.
    raise SystemExit(exit_code)