Skip to content

Commit bf960d7

Browse files
feat: add HTML support to wrapper_all_parsers (Update wrapper_all_parsers.py)
Integrate the HTML parser into the main parser wrapper:
- Import convert_html_to_instruments from html_parser
- Add support for FileType.html and FileType.htm
- Update _get_instruments_from_file with HTML routing logic
- Add documentation for the function

This enables the load_instruments_from_local_file function to automatically detect and process HTML files using the new HTML parser.
1 parent e12a92c commit bf960d7

1 file changed

Lines changed: 12 additions & 9 deletions

File tree

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,61 @@
11
'''
22
MIT License
3-
43
Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
54
Project: Harmony (https://harmonydata.ac.uk)
65
Maintainer: Thomas Wood (https://fastdatascience.com)
7-
86
Permission is hereby granted, free of charge, to any person obtaining a copy
97
of this software and associated documentation files (the "Software"), to deal
108
in the Software without restriction, including without limitation the rights
119
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1210
copies of the Software, and to permit persons to whom the Software is
1311
furnished to do so, subject to the following conditions:
14-
1512
The above copyright notice and this permission notice shall be included in all
1613
copies or substantial portions of the Software.
17-
1814
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1915
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2016
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2117
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2218
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2319
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2420
SOFTWARE.
25-
2621
'''
2722

2823
from typing import List
29-
3024
from harmony.parsing.excel_parser import convert_excel_to_instruments
3125
from harmony.parsing.pdf_parser import convert_pdf_to_instruments
3226
from harmony.parsing.text_parser import convert_text_to_instruments
27+
from harmony.parsing.html_parser import convert_html_to_instruments
3328
from harmony.schemas.enums.file_types import FileType
3429
from harmony.schemas.requests.text import RawFile, Instrument
3530

3631

3732
def _get_instruments_from_file(file):
    """
    Dispatch a single file to the parser that matches its file type.

    Args:
        file: Object exposing a ``file_type`` attribute that is compared
            against ``FileType`` members; it is handed unchanged to the
            selected parser.

    Returns:
        The result of the selected parser, or an empty list when the file
        type is not one of the handled types.
    """
    kind = file.file_type

    # PDF and DOCX share the same parser.
    if kind in (FileType.pdf, FileType.docx):
        return convert_pdf_to_instruments(file)

    # Plain text and CSV share the text parser.
    if kind in (FileType.txt, FileType.csv):
        return convert_text_to_instruments(file)

    if kind == FileType.xlsx:
        return convert_excel_to_instruments(file)

    if kind in (FileType.html, FileType.htm):
        return convert_html_to_instruments(file)

    # Unhandled file types yield no instruments rather than raising.
    return []
4753

4854

4955
def convert_files_to_instruments(files: List[RawFile]) -> List[Instrument]:
    """Parse every file and return all resulting instruments as one flat list.

    Files are processed in the order given, and the instruments produced by
    each file keep the order in which its parser returned them.
    """
    return [
        instrument
        for raw_file in files
        for instrument in _get_instruments_from_file(raw_file)
    ]

0 commit comments

Comments
 (0)