Skip to content

Commit 981cbb4

Browse files
committed
Refactor batch processing in git exports: enhance batch handling, improve branch checkout logic, and streamline file processing
1 parent 73ed665 commit 981cbb4

3 files changed

Lines changed: 107 additions & 52 deletions

File tree

exporters/git/batch_processor.py

Lines changed: 63 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,19 @@
66
"""
77

88
import os
9-
import shutil
10-
import subprocess
119
from datetime import datetime
1210
from pathlib import Path
1311
import random
1412
import json
1513

1614
from exporters.git import clone_target_repository_to_temp
17-
from exporters.git.git_utils import GIT_TIMEOUT
15+
from exporters.git.git_utils import checkout_branch, push_to_target_repository
16+
from sfs_processor import make_document
1817

1918

20-
def process_files_with_git_batch(json_files, output_dir, verbose, predocs):
21-
"""Process files with git batch workflow."""
22-
# Clone target repository once for all documents
19+
def process_files_with_git_batch(json_files, output_dir, verbose, predocs, batch_size=10):
20+
"""Process files with git batch workflow, using same branch but pushing after each batch."""
21+
# Clone target repository once for all batches
2322
repo_dir, original_cwd = clone_target_repository_to_temp(verbose=verbose)
2423
if repo_dir is None:
2524
raise RuntimeError("Failed to clone target repository")
@@ -28,60 +27,75 @@ def process_files_with_git_batch(json_files, output_dir, verbose, predocs):
2827
# Change to cloned repository directory
2928
os.chdir(repo_dir)
3029

31-
# Create unique branch name for this batch
30+
# Create unique branch name for this entire operation
3231
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
3332
random_suffix = random.randint(1000, 9999)
3433
unique_branch = f"batch_{timestamp}_{random_suffix}"
3534

36-
# Create and checkout new branch directly
37-
try:
38-
subprocess.run(['git', 'checkout', '-b', unique_branch],
39-
check=True, capture_output=True, timeout=GIT_TIMEOUT)
40-
if verbose:
41-
print(f"Skapade och bytte till branch '{unique_branch}' för batch-commits")
42-
except subprocess.CalledProcessError as e:
43-
print(f"Fel: Kunde inte skapa git branch: {e}")
35+
# Create and checkout new branch
36+
if not checkout_branch(unique_branch, create_if_missing=True, verbose=verbose):
37+
print(f"Fel: Kunde inte skapa git branch: {unique_branch}")
4438
return
4539

46-
# Process each JSON file
47-
from sfs_processor import make_document
48-
for json_file in json_files:
49-
# Use absolute path since we changed working directory
50-
abs_json_file = Path(original_cwd) / json_file
51-
try:
52-
with open(abs_json_file, 'r', encoding='utf-8') as f:
53-
data = json.load(f)
54-
except (json.JSONDecodeError, FileNotFoundError) as e:
55-
print(f"Fel vid läsning av {abs_json_file}: {e}")
56-
continue
57-
58-
# Create documents in the cloned repository AND save to original output directory
59-
# First convert to absolute path since we changed working directory
60-
original_output_dir = Path(original_cwd) / Path(output_dir).name if not Path(output_dir).is_absolute() else Path(output_dir)
61-
make_document(data, original_output_dir, ["git"], True, verbose, True, predocs, True)
62-
63-
# Push all commits to target repository
64-
if verbose:
65-
print(f"Pushar batch till target repository...")
66-
67-
from exporters.git.git_utils import push_to_target_repository
68-
if push_to_target_repository(unique_branch, 'origin', verbose):
69-
print(f"Batch pushad till target repository som branch '{unique_branch}'")
40+
# Split files into batches
41+
total_files = len(json_files)
42+
if total_files > batch_size:
43+
print(f"Delar upp {total_files} filer i batcher om {batch_size} filer var")
44+
batches = [json_files[i:i + batch_size] for i in range(0, total_files, batch_size)]
45+
print(f"Skapade {len(batches)} batcher")
46+
47+
# Process each batch in the same repository and branch, pushing after each
48+
for i, batch in enumerate(batches, 1):
49+
print(f"\nBearbetar batch {i}/{len(batches)} ({len(batch)} filer)...")
50+
_process_batch_files(batch, output_dir, verbose, predocs, original_cwd, i, len(batches))
51+
52+
# Push after each batch
53+
print(f"Pushar batch {i}/{len(batches)} till target repository...")
54+
if push_to_target_repository(unique_branch, 'origin', verbose):
55+
print(f"Batch {i}/{len(batches)} pushad till target repository som branch '{unique_branch}'")
56+
else:
57+
print(f"Misslyckades med att pusha batch {i}/{len(batches)} till target repository")
7058
else:
71-
print(f"Misslyckades med att pusha batch till target repository")
59+
print(f"Bearbetar {total_files} filer i en enda batch...")
60+
_process_batch_files(json_files, output_dir, verbose, predocs, original_cwd, 1, 1)
61+
62+
# Push the single batch
63+
print(f"Pushar alla {total_files} filer till target repository...")
64+
if push_to_target_repository(unique_branch, 'origin', verbose):
65+
print(f"Alla {total_files} filer pushade till target repository som branch '{unique_branch}'")
66+
else:
67+
print(f"Misslyckades med att pusha till target repository")
7268

73-
except subprocess.CalledProcessError as e:
74-
print(f"Fel vid git batch processing: {e}")
75-
if hasattr(e, 'stderr') and e.stderr:
76-
print(f"Git stderr: {e.stderr.decode('utf-8', errors='replace')}")
7769
except Exception as e:
7870
print(f"Oväntat fel vid git batch processing: {e}")
7971
finally:
8072
# Always change back to original directory
8173
os.chdir(original_cwd)
82-
# Clean up temporary directory
74+
75+
76+
def _process_batch_files(json_files, output_dir, verbose, predocs, original_cwd, batch_num, total_batches):
    """Process one batch of JSON files in the current repository and branch.

    Each file is read relative to ``original_cwd`` (the caller changes the
    working directory before calling this function), parsed as JSON and handed
    to ``make_document`` with the ``["git"]`` output mode. Files that cannot be
    read or parsed are reported and skipped; the rest of the batch continues.

    Args:
        json_files: Paths of the JSON files in this batch, relative to
            ``original_cwd`` or absolute.
        output_dir: Output directory passed on to ``make_document``. A relative
            path is resolved against ``original_cwd``.
        verbose: Enable verbose output.
        predocs: Passed through unchanged to ``make_document``.
        original_cwd: Working directory that was active before the caller
            changed into the cloned repository.
        batch_num: 1-based number of this batch (used for progress output only).
        total_batches: Total number of batches (used for progress output only).
    """
    # Kept as a function-local import, as in the original block.
    # NOTE(review): presumably this avoids an import cycle with sfs_processor,
    # which itself imports from exporters.git -- confirm before hoisting.
    from sfs_processor import make_document

    # Resolve the output directory once; it does not depend on the file being
    # processed. The whole relative path is joined onto original_cwd: using
    # only its final component would silently redirect nested or parent-relative
    # paths such as "out/sfs" or "../out" to the wrong directory.
    resolved_output_dir = Path(output_dir)
    if not resolved_output_dir.is_absolute():
        resolved_output_dir = Path(original_cwd) / resolved_output_dir

    for json_file in json_files:
        # Use an absolute path since the working directory has been changed
        abs_json_file = Path(original_cwd) / json_file
        try:
            with open(abs_json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
            # OSError covers FileNotFoundError as well as permission and other
            # read errors; one unreadable file must not abort the whole batch.
            print(f"Fel vid läsning av {abs_json_file}: {e}")
            continue

        # NOTE(review): per the original comment, this creates the document in
        # the cloned repository and also saves it to the output directory; the
        # meaning of the positional flags is defined by make_document.
        make_document(data, resolved_output_dir, ["git"], True, verbose, True, predocs, True)

    if verbose:
        print(f"Batch {batch_num}/{total_batches} bearbetad ({len(json_files)} filer)")
100+
101+

exporters/git/git_utils.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,49 @@ def stage_file(file_path: str, verbose: bool = False) -> bool:
359359
return False
360360

361361

362+
def checkout_branch(branch_name: str, create_if_missing: bool = True, verbose: bool = False) -> bool:
    """
    Check out a git branch, optionally creating it if the checkout fails.

    Git is invoked in the current working directory. A plain
    ``git checkout <branch>`` is attempted first; if it exits non-zero and
    ``create_if_missing`` is set, ``git checkout -b <branch>`` is run instead.

    Args:
        branch_name: Name of the branch to checkout
        create_if_missing: If True, create the branch if it doesn't exist
        verbose: Enable verbose output

    Returns:
        bool: True if checkout was successful, False otherwise (including when
        git fails, times out, or is not installed)
    """
    try:
        # No check=True here: a non-zero exit code is the signal to fall back
        # to creating the branch, not an error in itself.
        result = subprocess.run(['git', 'checkout', branch_name],
                                capture_output=True, timeout=GIT_TIMEOUT)

        if result.returncode == 0:
            if verbose:
                print(f"Bytte till branch '{branch_name}'")
            return True

        if not create_if_missing:
            if verbose:
                print(f"Branch '{branch_name}' finns inte och create_if_missing=False")
            return False

        # Plain checkout failed; try to create the branch instead
        subprocess.run(['git', 'checkout', '-b', branch_name],
                       check=True, capture_output=True, timeout=GIT_TIMEOUT)
        if verbose:
            print(f"Skapade och bytte till branch '{branch_name}'")
        return True

    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        # TimeoutExpired is not a CalledProcessError; without handling it here
        # a hanging git process would raise out of a function that promises to
        # return a bool. Both exception types expose the captured stderr.
        print(f"Fel vid checkout av branch '{branch_name}': {e}")
        if hasattr(e, 'stderr') and e.stderr:
            print(f"Git stderr: {e.stderr.decode('utf-8', errors='replace')}")
        return False
    except FileNotFoundError:
        print("Varning: Git hittades inte.")
        return False
403+
404+
362405
def create_commit_with_date(message: str, date: str, verbose: bool = False) -> bool:
363406
"""
364407
Create a git commit with a specified date.

sfs_processor.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from typing import Dict, Any, List, Optional
2424

2525
from downloaders.riksdagen_api import fetch_predocs_details, format_predocs_for_frontmatter
26-
from exporters.git.git_utils import GIT_TIMEOUT
2726
from formatters.format_sfs_text import (
2827
format_sfs_text_as_markdown,
2928
parse_logical_sections,
@@ -35,14 +34,13 @@
3534
from formatters.add_pdf_url_to_frontmatter import generate_pdf_url
3635
from formatters.frontmatter_manager import add_ikraft_datum_to_frontmatter
3736
from temporal.title_temporal import title_temporal
38-
from temporal.amendments import process_markdown_amendments, extract_amendments
37+
from temporal.amendments import extract_amendments
3938
from temporal.apply_temporal import apply_temporal
4039
from exporters.git import create_init_git_commit
4140
from util.yaml_utils import format_yaml_value
4241
from util.datetime_utils import format_datetime
4342
from util.file_utils import filter_json_files, save_to_disk
4443
from formatters.predocs_parser import parse_predocs_string
45-
from formatters.table_converter import convert_tables_in_markdown
4644

4745

4846
def create_safe_filename(beteckning: str, preserve_section_tags: bool = False) -> str:

0 commit comments

Comments
 (0)