Skip to content

Commit 5b6049f

Browse files
authored
Merge pull request #15 from DominicTWHV/dev
Merge a few experimental/WIP code into main
2 parents 77a4d76 + 0417fff commit 5b6049f

14 files changed

Lines changed: 135 additions & 30 deletions

edge/database/driver.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from edge.logger.context import db_logger
99

10-
from edge.registry.database import PrimaryDBConf, SecurityDBConf
10+
from edge.registry.database import PrimaryDBConf, SecurityDBConf, MetadataDBConf
1111

1212
class DBInitManager:
1313
@staticmethod
@@ -33,6 +33,7 @@ async def _init_sqlite(db_type: Literal["primary", "security_core", "security_ge
3333
"primary": (PrimaryDBConf.sqlite_schema_path),
3434
"security_core": (SecurityDBConf.sqlite_schema_path_core),
3535
"security_generic": (SecurityDBConf.sqlite_schema_path_generic),
36+
"metadata": (MetadataDBConf.sqlite_schema_path),
3637
}
3738

3839
schema_path = config_map[db_type]
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
-- Per-dataset bookkeeping: where the dataset comes from, its licensing and format,
-- how often the remote refreshes, entry counts, and whether it is enabled.
CREATE TABLE IF NOT EXISTS `dataset_metadata` (
2+
`repository` VARCHAR, -- GitHub repository, e.g. DominicTWHV/CTI-DB-DUMP | for a txt dataset that isn't hosted on GitHub, the dataset URL is stored instead
3+
`licensing` VARCHAR, -- dataset licensing
4+
`dataset_type` VARCHAR, -- cockatoo_core, text (like traditional url blocklists)
5+
`remote_update_interval` INT, -- in hours, how often the remote updates
6+
7+
`num_of_url_entries` INT, -- number of URL entries (presumably in this dataset; confirm against the writer)
8+
`num_of_file_entries` INT, -- number of file entries (presumably in this dataset; confirm against the writer)
9+
`num_of_invite_entries` INT, -- number of invite entries (presumably in this dataset; confirm against the writer)
10+
11+
`enabled` INT DEFAULT 0, -- whether this dataset is enabled (1 = true, 0 = false)
12+
13+
`last_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP -- defaults to the time the row is inserted
14+
);

edge/database/schema/primary.sql

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,6 @@ CREATE TABLE IF NOT EXISTS `configs` (
99
`last_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
1010
);
1111

12-
CREATE TABLE IF NOT EXISTS `datasets` (
13-
`security_repository` VARCHAR PRIMARY KEY, -- github repository for security dataset, ex: DominicTWHV/CTI-DB-DUMP
14-
15-
`in_use` INT DEFAULT 0, -- whether this dataset is actively being used (1 = true, 0 = false)
16-
`locked` INT DEFAULT 0, -- whether this dataset is locked from deletion (manually)
17-
18-
`remote_update_interval` INT DEFAULT 0, -- how often in minutes is this dataset updates by the remote peer (set automatically, do not manually edit)
19-
20-
`last_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
21-
);
22-
2312
CREATE TABLE IF NOT EXISTS `actions` (
2413
`server_id` VARCHAR PRIMARY KEY, -- server identifier
2514

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,3 @@
1-
CREATE TABLE IF NOT EXISTS `dataset_metadata` (
2-
`repository` VARCHAR, -- github repository, ex: DominicTWHV/CTI-DB-DUMP | If using a txt dataset, the url will be stored if the dataset isn't on github.
3-
`licensing` VARCHAR, -- dataset licensing
4-
`dataset_type` VARCHAR, -- cockatoo_core, text (like traditional url blocklists)
5-
`remote_update_interval` INT, -- in hours, how often the remote updates
6-
7-
`num_of_url_entries` INT,
8-
`num_of_file_entries` INT,
9-
`num_of_invite_entries` INT,
10-
11-
`last_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
12-
);
13-
141
CREATE TABLE IF NOT EXISTS `url_cache` (
152
`id` INTEGER PRIMARY KEY AUTOINCREMENT,
163

@@ -19,18 +6,24 @@ CREATE TABLE IF NOT EXISTS `url_cache` (
196
`url_hash` VARCHAR(64) NOT NULL,
207

218
`times_seen` INT DEFAULT 1, -- how many times this url has been seen by core
9+
10+
`mother_set_name` VARCHAR, -- the dataset this url originated from
2211

2312
`last_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP -- auto timestamp for last seen time
2413
);
2514

2615
CREATE TABLE IF NOT EXISTS `file_hash_cache` (
2716
`sha256` VARCHAR PRIMARY KEY,
2817

18+
`mother_set_name` VARCHAR, -- the dataset this file hash originated from
19+
2920
`last_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP -- auto timestamp for last seen time
3021
);
3122

3223
CREATE TABLE IF NOT EXISTS `invite_cache` (
3324
`invite` VARCHAR PRIMARY KEY, -- will be a sha256 hash
3425

26+
`mother_set_name` VARCHAR, -- the dataset this invite originated from
27+
3528
`last_update` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP -- auto timestamp for last seen time
3629
);

edge/database/sqlite/metadata.db

8 KB
Binary file not shown.
32 KB
Binary file not shown.
28 KB
Binary file not shown.

edge/helper/DBFunctions.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from edge.database.driver import SQLiteDriver, DBInitManager
66

7-
from edge.registry.database import PrimaryDBConf, SecurityDBConf
7+
from edge.registry.database import PrimaryDBConf, SecurityDBConf, MetadataDBConf
88

99
class AutoDBMgr:
1010

@@ -13,5 +13,9 @@ async def init_db() -> None: #initializes static db instances. Not used for data
1313

1414
for db_path, db_type in [
1515
(PrimaryDBConf.sqlite_db_path, "primary"),
16+
(SecurityDBConf.sqlite_db_path_core, "security_core"),
17+
(SecurityDBConf.sqlite_db_path_generic, "security_generic"),
18+
(MetadataDBConf.sqlite_db_path, "metadata"),
19+
1620
]:
1721
await DBInitManager.init_db(db_type=db_type, db_path=db_path)

edge/helper/downloadMgr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class DownloadManager:
3939

4040
@staticmethod
4141
async def download_file(url: str, dos_protection: bool = SessionConfigs.dos_protection, max_size: int = SessionConfigs.max_file_size) -> str:
42-
session = SessionFactory().grab_session()
42+
session = await SessionFactory().grab_session()
4343
async with session.get(url) as response:
4444

4545
if SessionConfigs.raise_for_status:

edge/helper/setOrchestrator.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import magic # file type identification\
2+
import tldextract #domain extraction
3+
4+
from edge.helper.downloadMgr import URLParser, DownloadManager
5+
from edge.helper.aiohttpSessionFactory import SessionFactory
6+
7+
from edge.logger.context import networking_logger
8+
9+
from edge.registry.networking import DatasetDownloadConfigs, FileSourceIdentConfigs
10+
11+
class Ident:
12+
13+
@staticmethod
14+
async def remote_file_type(url: str) -> str:
15+
session = await SessionFactory().grab_session()
16+
async with session.get(url) as response:
17+
mime_header = response.headers.get('Content-Type', '').lower()
18+
19+
chunk = await response.content.read(DatasetDownloadConfigs.read_chunk_size) #read a certain amount of bytes for identification
20+
21+
detected_type = magic.from_buffer(chunk, mime=True) #json, text/plain, etc.
22+
23+
networking_logger.debug(f"File Type Identification: Detected type: {detected_type}, Mime Header: {mime_header}")
24+
25+
return detected_type, mime_header
26+
27+
@staticmethod
28+
async def file_source(url: str) -> str:
29+
extracted = tldextract.extract(url)
30+
31+
domain = extracted.registered_domain.lower()
32+
33+
if domain in FileSourceIdentConfigs.github_domains:
34+
networking_logger.debug(f"URL {url} identified as GitHub domain: {domain}")
35+
return "github"
36+
37+
networking_logger.debug(f"URL {url} source is non-GitHub domain: {domain}")
38+
return "unknown"
39+
40+
class Verify:
    """Policy checks applied to a dataset file before it is downloaded."""

    @staticmethod
    async def file_type_allowed(detected_type: str) -> bool:
        """Decide whether a detected MIME type may be downloaded.

        Args:
            detected_type: MIME type detected from the file content.

        Returns:
            ``True`` when validation is switched off via
            ``DatasetDownloadConfigs.validate_file_type`` or when the type is
            listed in ``DatasetDownloadConfigs.allowed_mime_types``;
            ``False`` otherwise.
        """
        validation_enabled = DatasetDownloadConfigs.validate_file_type
        if not validation_enabled:
            networking_logger.debug(f"File Type Verification: Validation disabled. Bypassing checks. File type: {detected_type}")
            return True

        is_permitted = detected_type in DatasetDownloadConfigs.allowed_mime_types
        if not is_permitted:
            networking_logger.warning(f"File Type Verification: Detected type {detected_type} is NOT allowed.")
            return False

        networking_logger.info(f"File Type Verification: Detected type {detected_type} is allowed.")
        return True
55+
56+
class DSDownload:
    """Download pipeline for remote datasets."""

    @staticmethod
    async def pipeline(url: str) -> "str | None":
        """Identify, verify and download the dataset behind ``url``.

        The remote file type is sniffed first and checked with
        ``Verify.file_type_allowed``. For GitHub sources the download target
        is the ``metadata.json`` file under the parsed raw-content URL (the
        Cockatoo Core dataset format); for any other source ``url`` itself is
        downloaded.

        Args:
            url: Location of the dataset.

        Returns:
            Whatever ``DownloadManager.download_file`` returns for the chosen
            target, or ``None`` when the detected file type is not allowed.
        """
        detected_type, _mime_header = await Ident.remote_file_type(url)
        source = await Ident.file_source(url)

        if not await Verify.file_type_allowed(detected_type):
            networking_logger.error(f"Dataset Download Pipeline: File type {detected_type} not allowed. Aborting download for safety. URL: {url}")
            return None

        if source == "github":  # raw.githubusercontent.com links will NOT be identified as github
            raw_url = await URLParser.parse_github_url(url)
            # Strings have no .append(); join the standard Cockatoo Core
            # metadata filename onto the raw URL explicitly.
            # NOTE(review): assumes parse_github_url returns the raw base URL
            # as a string - confirm against URLParser.
            url = f"{str(raw_url).rstrip('/')}/metadata.json"

        # GitHub source: this is the metadata file. Otherwise: the set itself,
        # which is not using the Cockatoo Core dataset format.
        return await DownloadManager.download_file(url)

0 commit comments

Comments
 (0)