-
Notifications
You must be signed in to change notification settings - Fork 1
style: make_units, launch_runs #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
7PintsOfCherryGarcia
wants to merge
6
commits into
GeoGenetics:main
Choose a base branch
from
7PintsOfCherryGarcia:julian_dev
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
2498b61
Added additional options
7PintsOfCherryGarcia 62c04ee
added --account and --jobs parameters
7PintsOfCherryGarcia a3fb3d7
Improve make_units robustness for minimal input paths and missing dep…
7PintsOfCherryGarcia cb44822
Addressed PR #29
7PintsOfCherryGarcia 58d187f
Resolving second round of comments
7PintsOfCherryGarcia 64724de
Resolving third round of comments
7PintsOfCherryGarcia File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -42,7 +42,7 @@ def gzip_n_lines(in_gzip): | |||
| "--in-regex", | ||||
| action="store", | ||||
| type=str, | ||||
| default="^(?!Undetermined).*.(sam|bam|cram|fastq|fq)(.gz)?$", | ||||
| default="^(?!Undetermined).*.(sam|bam|fastq|fq)(.gz)?$", | ||||
| help="Regex to filter input files", | ||||
| ) | ||||
| parser.add_argument( | ||||
|
|
@@ -91,6 +91,7 @@ def gzip_n_lines(in_gzip): | |||
| action="store", | ||||
| nargs="+", | ||||
| default=[ | ||||
| f"date={pd.Timestamp.today().normalize().strftime('%Y-%m-%d')}", | ||||
| "sample=Lib", | ||||
| "material=DNA", | ||||
| "flowcell_pos=X", | ||||
|
|
@@ -173,7 +174,11 @@ def gzip_n_lines(in_gzip): | |||
| else: | ||||
| args.extra_file = Path(args.extra_file) | ||||
| extra_file_name = args.extra_file.name | ||||
| assert args.extra_file.exists(), "Extra file does not exist." | ||||
| if not args.extra_file.exists(): | ||||
| parser.error( | ||||
| "Extra file does not exist: " | ||||
| f"{args.extra_file}. Pass --extra-file name:/path/to/file with a valid path." | ||||
| ) | ||||
|
|
||||
|
|
||||
| # Add extra metadata | ||||
|
|
@@ -252,30 +257,39 @@ def gzip_n_lines(in_gzip): | |||
| exit(0) | ||||
|
|
||||
|
|
||||
| ###################### | ||||
| ### FORMAT COLUMNS ### | ||||
| ###################### | ||||
| # Add metadata | ||||
| for metadata_default in args.metadata_default: | ||||
| if metadata_default.find("=") > 0: | ||||
| key, value = metadata_default.split("=") | ||||
| if key not in units: | ||||
| units[key] = value | ||||
|
|
||||
|
|
||||
| ###################### | ||||
| ### FORMAT COLUMNS ### | ||||
| ###################### | ||||
| # Format date column (if present) | ||||
| if "date" in units.columns.values: | ||||
| units["date"] = pd.to_datetime(units["date"]) | ||||
|
|
||||
|
|
||||
| # Remove adapters if file SAM/BAM/CRAM | ||||
| units.loc[units.data.str.contains(r"\.(?:sam|bam|cram)$"), "adapters"] = pd.NA | ||||
|
|
||||
| # Add missing fields used in out-path formatting. | ||||
| missing_out_path_fields = [ | ||||
| key for key in out_path_wildcards if key not in units.columns.values | ||||
| ] | ||||
| if missing_out_path_fields: | ||||
| logging.warning( | ||||
| "Missing out-path fields in parsed metadata: %s. Applying defaults.", | ||||
| ", ".join(sorted(missing_out_path_fields)), | ||||
| ) | ||||
|
|
||||
| # Fix invalid values | ||||
| fix_cols = units.columns.drop("data") | ||||
| units[fix_cols] = units[fix_cols].replace(args.rm_chars, value="", regex=True) | ||||
|
|
||||
| # Normalize date column after metadata/default injection. | ||||
| if "date" in units.columns.values: | ||||
| units["date"] = pd.to_datetime(units["date"], errors="coerce") | ||||
| if units["date"].isna().any(): | ||||
| logging.warning( | ||||
| "Some date values could not be parsed. Using today's date for invalid rows." | ||||
| ) | ||||
| units.loc[units["date"].isna(), "date"] = out_path_defaults["date"] | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This feels a bit dangerous.
Suggested change
|
||||
|
|
||||
|
|
||||
| # Fix seq_type info and collapse | ||||
| if "read" in units: | ||||
|
|
@@ -311,26 +325,11 @@ def gzip_n_lines(in_gzip): | |||
| # Add workflow_ver (current workflow version), if present in out_path | ||||
| if "workflow_ver" in out_path_wildcards: | ||||
| import git | ||||
|
|
||||
| repo = git.Repo(Path(__file__).resolve(strict=True).parent.parent) | ||||
| commits = pd.DataFrame( | ||||
| [[commit.hexsha, commit.committed_date] for commit in repo.iter_commits()], | ||||
| columns=["hexsha", "date"], | ||||
| ).sort_values(by="date") | ||||
| tags = pd.DataFrame( | ||||
| [[tag.commit.hexsha, tag.name] for tag in repo.tags], columns=["hexsha", "tag"] | ||||
| ) | ||||
| commits = pd.merge(commits, tags, how="left", on="hexsha") | ||||
| # if no tag, use commit hexsha | ||||
| commits["tag"] = commits["tag"].fillna(commits["hexsha"]) | ||||
|
|
||||
| # Sanity check | ||||
| commits_no_tag = commits[commits.tag.str.len() == 40] | ||||
| assert all( | ||||
| commits_no_tag["hexsha"].eq(commits_no_tag["tag"]) | ||||
| ), "Commits HEX SHA do not match!" | ||||
|
|
||||
| units["workflow_ver"] = commits.iloc[-1]["tag"] | ||||
| try: | ||||
| units["workflow_ver"] = repo.git.describe("--tags", "--exact-match") | ||||
| except Exception: | ||||
| units["workflow_ver"] = repo.head.commit.hexsha | ||||
|
|
||||
|
|
||||
| # Reorder columns | ||||
|
|
@@ -391,7 +390,7 @@ def gzip_n_lines(in_gzip): | |||
| logging.debug(pd.concat(out_stats)) | ||||
|
|
||||
| with open(args.out_stats, "x") as out_stat_fh: | ||||
| np.set_printoptions(legacy="1.21") | ||||
| np.set_printoptions(legacy="1.25") | ||||
| out_stat_fh.write(f"# {args}\n") | ||||
| pd.concat(out_stats).dropna(axis=1, how="all").to_csv( | ||||
| out_stat_fh, | ||||
|
|
@@ -410,7 +409,7 @@ def gzip_n_lines(in_gzip): | |||
| # Create folders | ||||
| out_path.mkdir(parents=True, exist_ok=args.force) | ||||
| # Save units.tsv file | ||||
| units.drop(["extra_file_md5"], axis=1).dropna(axis=1, how="all").to_csv( | ||||
| units.dropna(axis=1, how="all").to_csv( | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why include the config md5 in |
||||
| out_path / "units.tsv", | ||||
| sep="\t", | ||||
| index=False, | ||||
|
|
||||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Won't this clash with lines 263-268?