'''
MIT License
Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
Project: Harmony (https://harmonydata.ac.uk)
Maintainer: Thomas Wood (https://fastdatascience.com)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

from typing import List

from harmony.parsing.excel_parser import convert_excel_to_instruments
from harmony.parsing.html_parser import convert_html_to_instruments
from harmony.parsing.pdf_parser import convert_pdf_to_instruments
from harmony.parsing.text_parser import convert_text_to_instruments
from harmony.schemas.enums.file_types import FileType
from harmony.schemas.requests.text import RawFile, Instrument


def _get_instruments_from_file(file):
    """
    Route a file to the parser that matches its declared file type.

    Only ``file.file_type`` is inspected here; the file object itself is
    passed unchanged to the selected parser.

    Args:
        file: RawFile object exposing a ``file_type`` attribute comparable
            to the ``FileType`` enum members.

    Returns:
        Whatever the selected parser returns (expected to be a list of
        Instrument objects), or an empty list when the file type is not
        one of the supported types.
    """
    file_type = file.file_type

    # PDF and DOCX share one parser, as do TXT and CSV, and HTML and HTM.
    if file_type in (FileType.pdf, FileType.docx):
        return convert_pdf_to_instruments(file)
    if file_type in (FileType.txt, FileType.csv):
        return convert_text_to_instruments(file)
    if file_type == FileType.xlsx:
        return convert_excel_to_instruments(file)
    if file_type in (FileType.html, FileType.htm):
        return convert_html_to_instruments(file)

    # Unsupported types are deliberately skipped rather than raising, so one
    # unknown file does not abort the processing of a whole batch.
    return []
def convert_files_to_instruments(files: List[RawFile]) -> List[Instrument]:
    """
    Convert a batch of files into one flat list of instruments.

    Each file is parsed independently via ``_get_instruments_from_file`` and
    the results are concatenated in the order the files were supplied.

    Args:
        files: The files to parse.

    Returns:
        All instruments parsed from all files; empty when ``files`` is empty
        or when no file yields any instruments.
    """
    instruments = []
    for file in files:
        instruments.extend(_get_instruments_from_file(file))
    return instruments