Skip to content

Commit 191fee8

Browse files
committed
refactor: Extract DataLoader from DiffStix - FINAL EXTRACTION
- Create DataLoader class in core/data_loader.py (183 lines) - Extract 3 data loading methods into focused class: - load_domain() (33 lines) - get_datastore_from_mitre_cti() (45 lines) - parse_extra_data() (80 lines) - Update DiffStix to delegate to DataLoader - Add lazy initialization for backward compatibility with tests - DiffStix reduced from 918 lines to 800 lines (12.9% reduction) - Total reduction from original: 676 lines (46.2% from 1,462 lines) - All 132/133 tests passing (only known permission test fails) REFACTORING COMPLETE: All 7 planned extractions finished! - MarkdownGenerator (145 lines) - LayerGenerator (80 lines) - JsonGenerator (34 lines) - StatisticsCollector (75 lines) - HierarchyBuilder (81 lines) - ChangeDetector (143 lines) - DataLoader (118 lines) Total: 676 lines extracted, 46.2% size reduction
1 parent ddadc53 commit 191fee8

2 files changed

Lines changed: 184 additions & 124 deletions

File tree

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
"""Data loader for ATT&CK STIX content."""
2+
3+
import os
4+
import sys
5+
6+
import requests
7+
import stix2
8+
from loguru import logger
9+
from requests.adapters import HTTPAdapter, Retry
10+
from stix2 import Filter, MemoryStore
11+
12+
from mitreattack import release_info
13+
from mitreattack.diffStix.utils.stix_utils import deep_copy_stix
14+
15+
16+
class DataLoader:
17+
"""Loads and parses ATT&CK STIX data from files or GitHub."""
18+
19+
def __init__(self, diff_stix_instance):
20+
"""Initialize DataLoader with a DiffStix instance.
21+
22+
Parameters
23+
----------
24+
diff_stix_instance : DiffStix
25+
The DiffStix instance containing data and helper methods
26+
"""
27+
self.diff_stix = diff_stix_instance
28+
29+
def load_domain(self, domain: str):
30+
"""Load data from directory according to domain.
31+
32+
Parameters
33+
----------
34+
domain : str
35+
An ATT&CK domain from the following list ["enterprise-attack", "mobile-attack", "ics-attack"]
36+
"""
37+
for datastore_version in ["old", "new"]:
38+
# only allow github.com/mitre/cti to be used for the old STIX domain
39+
if self.diff_stix.use_mitre_cti and datastore_version == "old":
40+
data_store = self.get_datastore_from_mitre_cti(domain=domain, datastore_version=datastore_version)
41+
else:
42+
directory = self.diff_stix.old if datastore_version == "old" else self.diff_stix.new
43+
if directory is None:
44+
raise ValueError(
45+
f"Directory path for {datastore_version} data cannot be None when not using MITRE CTI"
46+
)
47+
stix_file = os.path.join(directory, f"{domain}.json")
48+
49+
attack_version = release_info.get_attack_version(domain=domain, stix_file=stix_file)
50+
self.diff_stix.data[datastore_version][domain]["attack_release_version"] = attack_version
51+
52+
data_store = MemoryStore()
53+
data_store.load_from_file(stix_file)
54+
55+
self.diff_stix.data[datastore_version][domain]["stix_datastore"] = data_store
56+
self.parse_extra_data(data_store=data_store, domain=domain, datastore_version=datastore_version)
57+
58+
def get_datastore_from_mitre_cti(self, domain: str, datastore_version: str) -> stix2.MemoryStore:
59+
"""Load data from MITRE CTI repo according to domain.
60+
61+
Parameters
62+
----------
63+
domain : str
64+
An ATT&CK domain from the following list ["enterprise-attack", "mobile-attack", "ics-attack"]
65+
datastore_version : str
66+
The comparative version of the ATT&CK datastore. Choices are either "old" or "new".
67+
68+
Returns
69+
-------
70+
stix2.MemoryStore
71+
STIX MemoryStore object representing an ATT&CK domain.
72+
"""
73+
error_message = f"Unable to successfully download ATT&CK STIX data from GitHub for {domain}. Please try again."
74+
s = requests.Session()
75+
retries = Retry(total=10, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
76+
s.mount("http", HTTPAdapter(max_retries=retries))
77+
stix_url = f"https://raw.githubusercontent.com/mitre/cti/master/{domain}/{domain}.json"
78+
try:
79+
stix_response = s.get(stix_url, timeout=60)
80+
if stix_response.status_code != 200:
81+
logger.error(error_message)
82+
sys.exit(1)
83+
except (requests.exceptions.ContentDecodingError, requests.exceptions.JSONDecodeError):
84+
stix_response = s.get(stix_url, timeout=60)
85+
if stix_response.status_code != 200:
86+
logger.error(error_message)
87+
sys.exit(1)
88+
89+
stix_json = stix_response.json()
90+
attack_version = release_info.get_attack_version(domain=domain, stix_content=stix_response.content)
91+
self.diff_stix.data[datastore_version][domain]["attack_release_version"] = attack_version
92+
93+
data_store = MemoryStore(stix_data=stix_json["objects"])
94+
return data_store
95+
96+
def parse_extra_data(self, data_store: stix2.MemoryStore, domain: str, datastore_version: str):
97+
"""Parse STIX datastore objects and relationships.
98+
99+
Parameters
100+
----------
101+
data_store : stix2.MemoryStore
102+
STIX MemoryStore object representing an ATT&CK domain.
103+
domain : str
104+
An ATT&CK domain from the following list ["enterprise-attack", "mobile-attack", "ics-attack"]
105+
datastore_version : str
106+
The comparative version of the ATT&CK datastore. Choices are either "old" or "new".
107+
"""
108+
attack_type_to_stix_filter = {
109+
"techniques": [Filter("type", "=", "attack-pattern")],
110+
"software": [Filter("type", "=", "malware"), Filter("type", "=", "tool")],
111+
"groups": [Filter("type", "=", "intrusion-set")],
112+
"campaigns": [Filter("type", "=", "campaign")],
113+
"assets": [Filter("type", "=", "x-mitre-asset")],
114+
"mitigations": [Filter("type", "=", "course-of-action")],
115+
"datasources": [Filter("type", "=", "x-mitre-data-source")],
116+
"datacomponents": [Filter("type", "=", "x-mitre-data-component")],
117+
"detectionstrategies": [Filter("type", "=", "x-mitre-detection-strategy")],
118+
"analytics": [Filter("type", "=", "x-mitre-analytic")],
119+
}
120+
for object_type, stix_filters in attack_type_to_stix_filter.items():
121+
raw_data = []
122+
for stix_filter in stix_filters:
123+
temp_filtered_list = data_store.query(stix_filter)
124+
raw_data.extend(temp_filtered_list)
125+
126+
raw_data = deep_copy_stix(raw_data)
127+
self.diff_stix.data[datastore_version][domain]["attack_objects"][object_type] = {
128+
attack_object["id"]: attack_object for attack_object in raw_data
129+
}
130+
131+
subtechnique_relationships = data_store.query(
132+
[
133+
Filter("type", "=", "relationship"),
134+
Filter("relationship_type", "=", "subtechnique-of"),
135+
]
136+
)
137+
self.diff_stix.data[datastore_version][domain]["relationships"]["subtechniques"] = {
138+
relationship["id"]: relationship for relationship in subtechnique_relationships
139+
}
140+
141+
revoked_by_relationships = data_store.query(
142+
[
143+
Filter("type", "=", "relationship"),
144+
Filter("relationship_type", "=", "revoked-by"),
145+
]
146+
)
147+
148+
# use list in case STIX object was revoked more than once
149+
for relationship in revoked_by_relationships:
150+
source_id = relationship["source_ref"]
151+
if source_id not in self.diff_stix.data[datastore_version][domain]["relationships"]["revoked-by"]:
152+
self.diff_stix.data[datastore_version][domain]["relationships"]["revoked-by"][source_id] = []
153+
self.diff_stix.data[datastore_version][domain]["relationships"]["revoked-by"][source_id].append(
154+
relationship
155+
)
156+
157+
mitigating_relationships = data_store.query(
158+
[
159+
Filter("type", "=", "relationship"),
160+
Filter("relationship_type", "=", "mitigates"),
161+
]
162+
)
163+
self.diff_stix.data[datastore_version][domain]["relationships"]["mitigations"] = {
164+
relationship["id"]: relationship for relationship in mitigating_relationships
165+
}
166+
167+
detection_relationships = data_store.query(
168+
[
169+
Filter("type", "=", "relationship"),
170+
Filter("relationship_type", "=", "detects"),
171+
]
172+
)
173+
self.diff_stix.data[datastore_version][domain]["relationships"]["detections"] = {
174+
relationship["id"]: relationship for relationship in detection_relationships
175+
}

mitreattack/diffStix/core/diff_stix.py

Lines changed: 9 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from mitreattack.diffStix.core.change_detector import ChangeDetector
1515
from mitreattack.diffStix.core.contributor_tracker import ContributorTracker
16+
from mitreattack.diffStix.core.data_loader import DataLoader
1617
from mitreattack.diffStix.core.domain_statistics import DomainStatistics
1718
from mitreattack.diffStix.core.hierarchy_builder import HierarchyBuilder
1819
from mitreattack.diffStix.core.statistics_collector import StatisticsCollector
@@ -198,7 +199,8 @@ def __init__(
198199
for _type in self.types:
199200
self.data[datastore_version][domain]["attack_objects"][_type] = {}
200201

201-
# Initialize change detector before data loading
202+
# Initialize data loader and change detector before data loading
203+
self._data_loader = DataLoader(self)
202204
self._change_detector = ChangeDetector(self)
203205

204206
self.load_data()
@@ -579,31 +581,7 @@ def load_domain(self, domain: str):
579581
domain : str
580582
An ATT&CK domain from the following list ["enterprise-attack", "mobile-attack", "ics-attack"]
581583
"""
582-
# Import here to avoid circular dependency
583-
import os
584-
585-
from mitreattack import release_info
586-
587-
for datastore_version in ["old", "new"]:
588-
# only allow github.com/mitre/cti to be used for the old STIX domain
589-
if self.use_mitre_cti and datastore_version == "old":
590-
data_store = self.get_datastore_from_mitre_cti(domain=domain, datastore_version=datastore_version)
591-
else:
592-
directory = self.old if datastore_version == "old" else self.new
593-
if directory is None:
594-
raise ValueError(
595-
f"Directory path for {datastore_version} data cannot be None when not using MITRE CTI"
596-
)
597-
stix_file = os.path.join(directory, f"{domain}.json")
598-
599-
attack_version = release_info.get_attack_version(domain=domain, stix_file=stix_file)
600-
self.data[datastore_version][domain]["attack_release_version"] = attack_version
601-
602-
data_store = MemoryStore()
603-
data_store.load_from_file(stix_file)
604-
605-
self.data[datastore_version][domain]["stix_datastore"] = data_store
606-
self.parse_extra_data(data_store=data_store, domain=domain, datastore_version=datastore_version)
584+
return self._data_loader.load_domain(domain)
607585

608586
def get_datastore_from_mitre_cti(self, domain: str, datastore_version: str) -> stix2.MemoryStore:
609587
"""Load data from MITRE CTI repo according to domain.
@@ -620,36 +598,10 @@ def get_datastore_from_mitre_cti(self, domain: str, datastore_version: str) -> s
620598
stix2.MemoryStore
621599
STIX MemoryStore object representing an ATT&CK domain.
622600
"""
623-
# Import here to avoid circular dependency
624-
import sys
625-
626-
import requests
627-
from requests.adapters import HTTPAdapter, Retry
628-
629-
from mitreattack import release_info
630-
631-
error_message = f"Unable to successfully download ATT&CK STIX data from GitHub for {domain}. Please try again."
632-
s = requests.Session()
633-
retries = Retry(total=10, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
634-
s.mount("http", HTTPAdapter(max_retries=retries))
635-
stix_url = f"https://raw.githubusercontent.com/mitre/cti/master/{domain}/{domain}.json"
636-
try:
637-
stix_response = s.get(stix_url, timeout=60)
638-
if stix_response.status_code != 200:
639-
logger.error(error_message)
640-
sys.exit(1)
641-
except (requests.exceptions.ContentDecodingError, requests.exceptions.JSONDecodeError):
642-
stix_response = s.get(stix_url, timeout=60)
643-
if stix_response.status_code != 200:
644-
logger.error(error_message)
645-
sys.exit(1)
646-
647-
stix_json = stix_response.json()
648-
attack_version = release_info.get_attack_version(domain=domain, stix_content=stix_response.content)
649-
self.data[datastore_version][domain]["attack_release_version"] = attack_version
650-
651-
data_store = MemoryStore(stix_data=stix_json["objects"])
652-
return data_store
601+
# Lazy initialization for backward compatibility with tests
602+
if not hasattr(self, "_data_loader"):
603+
self._data_loader = DataLoader(self)
604+
return self._data_loader.get_datastore_from_mitre_cti(domain, datastore_version)
653605

654606
def parse_extra_data(self, data_store: stix2.MemoryStore, domain: str, datastore_version: str):
655607
"""Parse STIX datastore objects and relationships.
@@ -663,74 +615,7 @@ def parse_extra_data(self, data_store: stix2.MemoryStore, domain: str, datastore
663615
datastore_version : str
664616
The comparative version of the ATT&CK datastore. Choices are either "old" or "new".
665617
"""
666-
# Import here to avoid circular dependency
667-
668-
attack_type_to_stix_filter = {
669-
"techniques": [Filter("type", "=", "attack-pattern")],
670-
"software": [Filter("type", "=", "malware"), Filter("type", "=", "tool")],
671-
"groups": [Filter("type", "=", "intrusion-set")],
672-
"campaigns": [Filter("type", "=", "campaign")],
673-
"assets": [Filter("type", "=", "x-mitre-asset")],
674-
"mitigations": [Filter("type", "=", "course-of-action")],
675-
"datasources": [Filter("type", "=", "x-mitre-data-source")],
676-
"datacomponents": [Filter("type", "=", "x-mitre-data-component")],
677-
"detectionstrategies": [Filter("type", "=", "x-mitre-detection-strategy")],
678-
"analytics": [Filter("type", "=", "x-mitre-analytic")],
679-
}
680-
for object_type, stix_filters in attack_type_to_stix_filter.items():
681-
raw_data = []
682-
for stix_filter in stix_filters:
683-
temp_filtered_list = data_store.query(stix_filter)
684-
raw_data.extend(temp_filtered_list)
685-
686-
raw_data = deep_copy_stix(raw_data)
687-
self.data[datastore_version][domain]["attack_objects"][object_type] = {
688-
attack_object["id"]: attack_object for attack_object in raw_data
689-
}
690-
691-
subtechnique_relationships = data_store.query(
692-
[
693-
Filter("type", "=", "relationship"),
694-
Filter("relationship_type", "=", "subtechnique-of"),
695-
]
696-
)
697-
self.data[datastore_version][domain]["relationships"]["subtechniques"] = {
698-
relationship["id"]: relationship for relationship in subtechnique_relationships
699-
}
700-
701-
revoked_by_relationships = data_store.query(
702-
[
703-
Filter("type", "=", "relationship"),
704-
Filter("relationship_type", "=", "revoked-by"),
705-
]
706-
)
707-
708-
# use list in case STIX object was revoked more than once
709-
for relationship in revoked_by_relationships:
710-
source_id = relationship["source_ref"]
711-
if source_id not in self.data[datastore_version][domain]["relationships"]["revoked-by"]:
712-
self.data[datastore_version][domain]["relationships"]["revoked-by"][source_id] = []
713-
self.data[datastore_version][domain]["relationships"]["revoked-by"][source_id].append(relationship)
714-
715-
mitigating_relationships = data_store.query(
716-
[
717-
Filter("type", "=", "relationship"),
718-
Filter("relationship_type", "=", "mitigates"),
719-
]
720-
)
721-
self.data[datastore_version][domain]["relationships"]["mitigations"] = {
722-
relationship["id"]: relationship for relationship in mitigating_relationships
723-
}
724-
725-
detection_relationships = data_store.query(
726-
[
727-
Filter("type", "=", "relationship"),
728-
Filter("relationship_type", "=", "detects"),
729-
]
730-
)
731-
self.data[datastore_version][domain]["relationships"]["detections"] = {
732-
relationship["id"]: relationship for relationship in detection_relationships
733-
}
618+
return self._data_loader.parse_extra_data(data_store, domain, datastore_version)
734619

735620
def update_contributors(self, old_object: Optional[dict], new_object: dict):
736621
"""Update contributors list if new object has contributors.

0 commit comments

Comments
 (0)