|
62 | 62 | from scanpipe.pipes import purldb |
63 | 63 | from scanpipe.pipes import resolve |
64 | 64 | from scanpipe.pipes import scancode |
| 65 | +from scanpipe.pipes import strings |
65 | 66 | from scanpipe.pipes import symbolmap |
66 | 67 | from scanpipe.pipes import symbols |
67 | 68 |
|
@@ -2146,3 +2147,84 @@ def _map_javascript_symbols(to_resource, javascript_from_resources, logger): |
2146 | 2147 | to_resource.update(status=flag.MAPPED) |
2147 | 2148 | return 1 |
2148 | 2149 | return 0 |
| 2150 | + |
| 2151 | + |
def map_javascript_strings(project, logger=None):
    """
    Relate deployed JavaScript/TypeScript files to their source files by
    comparing the string literals recorded for each resource.

    Nothing is done, and ``None`` is returned, when either side has no
    candidate resource. Progress and the final tally are reported through
    ``logger`` when one is provided.
    """
    js_extensions = [".ts", ".js"]
    all_files = project.codebaseresources.files()

    # Deployed side: to/ files that have no relation yet and carry some
    # extra_data.
    javascript_to_resources = (
        all_files.to_codebase()
        .has_no_relation()
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )

    # Source side: from/ files outside of "/test/" paths that carry some
    # extra_data.
    javascript_from_resources = (
        all_files.from_codebase()
        .exclude(path__contains="/test/")
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )

    # Both sides are required for a mapping to be possible.
    if not javascript_from_resources.exists() or not javascript_to_resources.exists():
        return

    from_total = javascript_from_resources.count()
    to_total = javascript_to_resources.count()
    if logger:
        message = (
            f"Mapping {to_total:,d} JavaScript resources"
            f" using string literals against {from_total:,d}"
            " from/ resources."
        )
        logger(message)

    to_resources = javascript_to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_total, logger)

    # Each helper call yields 1 when a relation was created, 0 otherwise.
    resource_mapped = sum(
        _map_javascript_strings(deployed_js, javascript_from_resources, logger)
        for deployed_js in progress.iter(to_resources)
    )
    if logger:
        logger(f"{resource_mapped:,d} resource mapped using strings")
| 2192 | + |
| 2193 | + |
def _map_javascript_strings(to_resource, javascript_from_resources, logger):
    """
    Map a deployed JavaScript resource to its source using string literals and
    return 1 if match is found otherwise return 0.

    ``to_resource`` is the deployed resource; its string literals are read
    from ``extra_data["source_strings"]``. ``javascript_from_resources`` is an
    iterable of candidate source resources exposing the same ``extra_data``
    key. ``logger`` is accepted for signature parity with the other mapping
    helpers and is not used here.

    Resources with missing, empty, or fewer than
    ``ignoreable_string_threshold`` string literals are skipped. Among the
    remaining sources, the one reported as a match with the highest
    similarity is related to ``to_resource`` and that similarity is stored
    as ``js_string_map_score`` on the relation.
    """
    ignoreable_string_threshold = 5
    to_strings = to_resource.extra_data.get("source_strings")

    # `.get()` yields None when the key is absent, so test for emptiness
    # before taking the length.
    if not to_strings or len(to_strings) < ignoreable_string_threshold:
        return 0

    best_matching_score = 0
    best_match = None
    for source_js in javascript_from_resources:
        from_strings = source_js.extra_data.get("source_strings")
        if not from_strings or len(from_strings) < ignoreable_string_threshold:
            continue

        is_match, similarity = strings.match_source_strings_to_deployed(
            source_strings=from_strings,
            deployed_strings=to_strings,
        )

        if is_match and similarity > best_matching_score:
            best_matching_score = similarity
            best_match = source_js

    if best_match is None:
        return 0

    pipes.make_relation(
        from_resource=best_match,
        to_resource=to_resource,
        map_type="javascript_strings",
        # Record the score of the selected source, not the score of the
        # last source that happened to be compared.
        extra_data={"js_string_map_score": best_matching_score},
    )
    to_resource.update(status=flag.MAPPED)
    return 1
0 commit comments