Skip to content

Commit 937b057

Browse files
committed
fix requirements deduplication, category extraction separators and reconciliation errors in some repos. Fixes #980
1 parent c3b642b commit 937b057

15 files changed

Lines changed: 2915 additions & 6 deletions

File tree

src/somef/export/json_export.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,14 @@ def unify_results(repo_data: dict) -> dict:
974974
else:
975975
canonical = canonicalize_value(value, value_type)
976976
key = str(canonical)
977+
elif category == constants.CAT_REQUIREMENTS:
978+
req_name = result.get("name", "").strip().lower()
979+
req_version = result.get("version", "").strip()
980+
if req_name:
981+
key = f"REQ-{req_name}-{req_version}"
982+
else:
983+
canonical = canonicalize_value(value, value_type)
984+
key = str(canonical)
977985
else:
978986
# Normal behavior for the rest of the categories
979987
canonical = canonicalize_value(value, value_type)

src/somef/header_analysis.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,25 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]:
138138

139139
content, none_header_content = mardown_parser.extract_content_per_header(text, headers)
140140
parents = mardown_parser.extract_headers_parents(text)
141+
142+
# parent_list = [parents.get(h) for h in header_list]
143+
144+
content_map = dict(zip(header_list, content))
141145

146+
aligned_content = [content_map[h] for h in header_list]
147+
aligned_parents = [parents.get(h) for h in header_list]
148+
149+
# df = pd.DataFrame({
150+
# 'Header': header_list,
151+
# 'Content': content,
152+
# 'ParentHeader': [parents.get(h) for h in header_list],
153+
# })
154+
142155
df = pd.DataFrame({
143156
'Header': header_list,
144-
'Content': content,
145-
'ParentHeader': [parents.get(h) for h in header_list],
157+
'Content': aligned_content,
158+
'ParentHeader': aligned_parents,
146159
})
147-
148160
# df['Content'].replace('', np.nan, inplace=True)
149161
df['Content'] = df['Content'].replace('', np.nan)
150162
df = df.dropna(subset=['Content'])

src/somef/parser/mardown_parser.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@ def extract_headers(original_text):
1919
if is_header(line):
2020
#text = re.sub(regex, '', line)
2121
text = get_tag_content(line)
22+
# Check if the header is actually a separator as -----------
23+
if is_separator_header(text):
24+
index += 1
25+
continue
2226
if index + 1 >= limit:
2327
output[text] = True
24-
elif not splitted[index + 1].startswith("<h"):
28+
elif not splitted[index + 1].startswith("<h") or not is_header(splitted[index + 1]):
2529
output[text] = True
2630
else:
2731
output[text] = False
@@ -40,10 +44,23 @@ def extract_headers_with_tags(original_text):
4044
line = splitted[index]
4145
if line.startswith("<h"):
4246
if is_header(line):
47+
# ignore separators
48+
if is_separator_header(get_tag_content(line)):
49+
index += 1
50+
continue
4351
output.append(replace_html_tags(line))
4452
index += 1
4553
return output
4654

55+
def is_separator_header(text):
    """Return True when *text* is a false header created by a separator
    line (``---`` or ``===``) that the Markdown parser mistakenly turned
    into an ``<h2>`` element."""
    # A separator is nothing but dashes and/or equals signs once
    # surrounding whitespace is removed.
    return re.fullmatch(r'[-=]+', text.strip()) is not None


def remove_separator_lines(text: str) -> str:
    """Strip separator lines (``---`` or ``===``) out of extracted content."""
    kept_lines = [ln for ln in text.split('\n') if not is_separator_header(ln)]
    return '\n'.join(kept_lines)
4764

4865
def extract_content_per_header(original_text, headers):
4966
keys = list(headers.keys())
@@ -86,7 +103,9 @@ def extract_content_per_header(original_text, headers):
86103
header_content = get_text(top_index + offset, -1, text_tokenized)
87104
if header_content.startswith('\n'):
88105
header_content = header_content[1:]
89-
content.append(header_content)
106+
if headers[top]:
107+
content.append(header_content)
108+
# content.append(header_content)
90109
output[top] = header_content
91110
return content, none_header_content
92111

@@ -120,7 +139,7 @@ def get_text(init_index, end_index, text_tokenized):
120139
while init_index < end_index:
121140
output = output + '\n' + text_tokenized[init_index]
122141
init_index += 1
123-
return output
142+
return remove_separator_lines(output)
124143

125144

126145
def extract_bash(text):

src/somef/test/test_JSON_export.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,5 +874,133 @@ def test_issue_953_publication_reconciliation(self):
874874
assert bibtex_cit is not None, "Bibtex citation (Text_excerpt) not found"
875875
assert bibtex_cit["result"]["format"] == "bibtex"
876876
assert bibtex_cit["result"]["year"] == "2019"
877+
878+
os.remove(output_path)
879+
880+
881+
def test_issue_reconciliation_workloop(self):
882+
"""Checks that key metadata is correctly extracted and reconciled from codemeta.json and README.
883+
Issue with canonicalization of values"""
884+
885+
output_path = test_data_path + "test_issue_reconciliation_workloopr.json"
886+
887+
somef_cli.run_cli(threshold=0.8,
888+
ignore_classifiers=False,
889+
repo_url=None,
890+
local_repo=test_data_repositories + "workloopr",
891+
doc_src=None,
892+
in_file=None,
893+
output=output_path,
894+
graph_out=None,
895+
graph_format="turtle",
896+
codemeta_out=None,
897+
pretty=True,
898+
missing=False,
899+
readme_only=False)
900+
901+
with open(output_path, "r") as text_file:
902+
json_content = json.loads(text_file.read())
903+
904+
assert constants.CAT_AUTHORS in json_content
905+
authors = json_content[constants.CAT_AUTHORS]
906+
assert len(authors) == 2
907+
author_names = [a["result"]["value"] for a in authors]
908+
assert "Vikram B. Baliga" in author_names
909+
assert "Shreeram Senthivasan" in author_names
910+
911+
assert constants.CAT_INSTALLATION in json_content
912+
installation = json_content[constants.CAT_INSTALLATION]
913+
assert any("devtools::install_github" in i["result"]["value"] for i in installation)
914+
915+
assert constants.CAT_CITATION in json_content
916+
citations = json_content[constants.CAT_CITATION]
917+
# print(citations)
918+
assert any("workloopR" in c["result"]["value"] for c in citations)
919+
920+
os.remove(output_path)
921+
922+
923+
924+
def test_issue_reconciliation_cropwater(self):
925+
"""Checks that key metadata is correctly extracted and reconciled from codemeta.json and README.
926+
Issue with canonicalization of values"""
927+
928+
output_path = test_data_path + "test_issue_reconciliation_cropwater.json"
929+
930+
somef_cli.run_cli(threshold=0.8,
931+
ignore_classifiers=False,
932+
repo_url=None,
933+
local_repo=test_data_repositories + "cropwater",
934+
doc_src=None,
935+
in_file=None,
936+
output=output_path,
937+
graph_out=None,
938+
graph_format="turtle",
939+
codemeta_out=None,
940+
pretty=True,
941+
missing=False,
942+
readme_only=False)
943+
944+
with open(output_path, "r") as text_file:
945+
json_content = json.loads(text_file.read())
877946

947+
assert constants.CAT_AUTHORS in json_content
948+
authors = json_content[constants.CAT_AUTHORS]
949+
assert len(authors) == 5
950+
author_names = [a["result"]["value"] for a in authors]
951+
assert "Gabriel Constantino Blain" in author_names
952+
assert "Adam H. Sparks" in author_names
953+
954+
955+
assert constants.CAT_INSTALLATION in json_content
956+
installation = json_content[constants.CAT_INSTALLATION]
957+
assert any("gabrielblain/CropWaterBalance" in i["result"]["value"] for i in installation)
958+
959+
assert constants.CAT_LICENSE in json_content
960+
licenses = json_content[constants.CAT_LICENSE]
961+
spdx = next((l for l in licenses if l["result"].get("spdx_id") == "MIT"), None)
962+
assert spdx is not None, "MIT license not found"
963+
964+
assert constants.CAT_DESCRIPTION in json_content
965+
878966
os.remove(output_path)
967+
968+
969+
def test_issue_980_reconciliation_requirements(self):
970+
"""Checks that requirements with the same name and version are correctly reconciled
971+
into a single entry merging sources, even when their raw values are not the same
972+
(e.g. 'pracma (>= 2.0.7)' vs 'pracma==>= 2.0.7')."""
973+
974+
output_path = test_data_path + "test_issue_980_reconciliation_requirements.json"
975+
976+
somef_cli.run_cli(threshold=0.8,
977+
ignore_classifiers=False,
978+
repo_url=None,
979+
local_repo=test_data_repositories + "ggstatsplot",
980+
doc_src=None,
981+
in_file=None,
982+
output=output_path,
983+
graph_out=None,
984+
graph_format="turtle",
985+
codemeta_out=None,
986+
pretty=True,
987+
missing=False,
988+
readme_only=False)
989+
990+
with open(output_path, "r") as text_file:
991+
json_content = json.loads(text_file.read())
992+
993+
requirements = json_content[constants.CAT_REQUIREMENTS]
994+
# print(requirements)
995+
996+
seen_keys = [(r["result"]["name"], r["result"].get("version", "")) for r in requirements]
997+
unique_keys = set(seen_keys)
998+
999+
assert len(seen_keys) == len(unique_keys), (
1000+
f"Duplicate requirements found: "
1001+
f"{[k for k in unique_keys if seen_keys.count(k) > 1]}"
1002+
)
1003+
1004+
os.remove(output_path)
1005+
1006+
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
2+
![til](https://github.com/BM32ESRF/LaueNN/blob/main/docs/idea_lauenn/frames_medres.gif)
3+
4+
[![Conda](https://img.shields.io/conda/pn/bm32esrf/lauetoolsnn?color=green&label=supported%20platform)](https://anaconda.org/bm32esrf/lauetoolsnn)
5+
[![GitHub tag (latest by date)](https://img.shields.io/github/v/tag/BM32ESRF/LaueNN?color=blue&label=Github%20tag)](https://github.com/BM32ESRF/LaueNN)
6+
7+
[![Lint, test, build, and publish](https://github.com/BM32ESRF/LaueNN/actions/workflows/complete_workflow.yml/badge.svg)](https://github.com/BM32ESRF/LaueNN/actions/workflows/complete_workflow.yml)
8+
[![PyPI](https://img.shields.io/pypi/v/lauetoolsnn)](https://pypi.python.org/pypi/lauetoolsnn/)
9+
[![PyPI pyversions](https://img.shields.io/pypi/pyversions/lauetoolsnn.svg)](https://pypi.python.org/pypi/lauetoolsnn/)
10+
11+
[![Anaconda-Server Badge](https://anaconda.org/bm32esrf/lauetoolsnn/badges/license.svg)](https://anaconda.org/bm32esrf/lauetoolsnn)
12+
[![Conda](https://img.shields.io/conda/v/bm32esrf/lauetoolsnn?style=flat-square)](https://anaconda.org/bm32esrf/lauetoolsnn)
13+
14+
15+
[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/BM32ESRF/LaueNN/issues)
16+
17+
18+
# lauetoolsnn/LaueNN
19+
An autonomous feed-forward neural network (FFNN) model to predict the HKL in single/multi-grain/multi-phase Laue patterns with high efficiency and accuracy is introduced.
20+
21+
Laue diffraction indexation (especially Laue images comprising of diffraction signal from several polycrystals/multi phase materials) can be a very tedious and CPU intensive process. To tackle this, LaueNN or LauetoolsNN was developed employing the power of neural networks to speed up a part of the indexation process. In the LaueNN_presentation (https://github.com/BM32ESRF/LaueNN/tree/main/presentations/LaueNN_presentation.pdf), several steps of Laue pattern indexation with the classical approach are described. We have replaced the most CPU intensive step with the Neural Networks. The step where the Laue indices hkl of each spot is now determined with the Neural networks; alongside the spot hkl index, the neural network also predicts the Material that spot belongs to. This can be useful in case of Laue images comprising of diffraction signal from multiple phases.
22+
LaueNN uses the existing modules of Lauetools to generate simulated Laue patterns. The whole workflow and the application of this tool is illustrated in this article (https://onlinelibrary.wiley.com/iucr/doi/10.1107/S1600576722004198)
23+
24+
For classical indexation of Laue pattern (GUI and scripts), check out the sister package: https://github.com/BM32ESRF/lauetools
25+
26+
27+
### Video tutorial
28+
------------------------------
29+
- Video 1: Working with jupyter notebook scripts : https://cloud.esrf.fr/s/6q4DJfAn7K46BGN
30+
- Video 2: Working with lauetoolsnn GUI : https://cloud.esrf.fr/s/AeGow4CoqZRJiyx
31+
32+
33+
### Requirements: (latest version of each libraries accessed on 03/04/2022)
34+
------------------------------
35+
- PyQt5 (GUI)
36+
- matplotlib
37+
- Keras
38+
- tensorflow
39+
- numpy
40+
- scipy (scipy transform rotation is used)
41+
- h5py (required for writing neural network model files)
42+
- scikit-learn (required for generating trained model classification reports)
43+
- fabio (used for opening raw Laue tiff images)
44+
- networkx (to be replaced with numpy in the future)
45+
- scikit-image (used for hough based analysis of Laue patterns)
46+
- tqdm (required only for notebook scripts)
47+
- opencv (for LOG based peak search)
48+
- pandas and pytables (for writing pickle to h5)
49+
50+
51+
### Installation
52+
------------------------------
53+
Lauetoolsnn can be installed either via PyPI using the following command in terminal (this installs all dependencies automatically):
54+
55+
https://pypi.org/project/lauetoolsnn/
56+
57+
https://anaconda.org/bm32esrf/lauetoolsnn
58+
59+
``` bash
60+
$ pip install lauetoolsnn
61+
$ or
62+
$conda install -c bm32esrf lauetoolsnn -c conda-forge
63+
```
64+
For macOS users, please use the conda installation to avoid build errors, or the package can be compiled and installed locally via the setup.py file. Download the Github repository and type the following in terminal. In this case, the dependencies have to be installed manually. The latest version of each dependency works as of (01/04/2022).
65+
``` bash
66+
$ python setup.py install
67+
```
68+
69+
See procedure_usage_lauetoolsnn.pdf for installation and how to write the configuration file to be used with GUI.
70+
This project is also hosted on sourceforge.net https://lauetoolsnn.sourceforge.io
71+
72+
73+
### Documentation
74+
------------------------------
75+
Documentation (under construction) for lauetoolsnn/lauenn is on the following webpage
76+
https://lauenn.readthedocs.io/en/latest/
77+
78+
79+
### Example case
80+
------------------------------
81+
Two example case studies are included in the lauetoolsnn\examples folder.
82+
Run the GUI by either launching directly from the terminal using the 'lauetoolsnn' command or by running it locally with python lauetoolsneuralnetwork.py command.
83+
84+
First step is to load the config.txt from the example folder, it sets all the values of the GUI to the case study.
85+
In the GUI:
86+
- Step1: File --> load config . Select the config file from the example directory.
87+
- Step1a: If config file is not available, one can set parameters in the configure parameters window directly.
88+
- Step2: Press the configure parameters button and press Accept button at the end (the values are loaded from the config file).
89+
- Step3: Press Generate Training dataset button. This will generate the training and validation dataset for neural network.
90+
- Step4: Press Train Neural network button. This will start the training process and once finished will save the trained model.
91+
- Step5: Press the Live prediction with IPF map to start the prediction on predefined experimental dataset. Example datafile is included in the examples folder.
92+
- Step6: Once analyzed, the results can be saved using the save results button.
93+
94+
In addition, all the above mentioned steps can be done without the GUI and are detailed in the lauetoolsnn\example_notebook_scripts folder.
95+
Jupyter notebook scripts are provided to run all the steps sequentially.
96+
97+
The indexed orientation matrix is also written in ".ctf" format, which can then be opened with channel 5 Aztec or MTEX software to do post processing related to orientations analysis. MTEX post processing script is also included in the lauetoolsnn\util_script\MTEX_plot.m
98+
99+
100+
### Citation
101+
------------------------------
102+
If you use this software, please cite it using the metadata available in the citation_bibtex.cff file in root.
103+
``` bash
104+
Purushottam Raj Purohit, R. R. P., Tardif, S., Castelnau, O., Eymery, J., Guinebretiere, R., Robach, O., Ors, T. & Micha, J.-S. (2022). J. Appl. Cryst. 55, 737-750.
105+
```
106+
107+
108+
### Known Issues
109+
------------------------------
110+
So far, there is an issue with H5py and HDF5 versions in the Windows installation with conda. If an error with H5py version mismatch exists after conda installation, please try "pip install lauetoolsnn" on Windows, as this should not have this problem. The other possibility is to install H5py with pip before or after installing lauetoolsnn with conda.
111+
112+
113+
### Support
114+
------------------------------
115+
Do not hesitate to contact the development team at [purushot@esrf.fr](mailto:purushot@esrf.fr) or [micha@esrf.fr](mailto:micha@esrf.fr).
116+
117+
### Funding
118+
------------------------------
119+
This code was developed as a result of French-German project funded respectively by the ANR and DFG (HoTMiX project reference number ANR-19-CE09-0035-01): https://www.bam.de/Content/EN/Projects/HoTMiX/hotmix.html
120+
121+
### Maintainer(s)
122+
------------------------------
123+
* [Ravi PURUSHOTTAM](https://github.com/ravipurohit1991)

0 commit comments

Comments
 (0)