Skip to content

Commit 7563e50

Browse files
authored
feat: add documentation to module fields in the reflect API. (VirusTotal#618)
The `yara_x::mods::reflect` API now exports documentation strings for both module field names and functions.
1 parent 85ecbcc commit 7563e50

13 files changed

Lines changed: 370 additions & 70 deletions

File tree

lib/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,14 @@ generate-proto-code = [
5858
"dep:yara-x-proto"
5959
]
6060

61+
# Enables the generation of documentation for module fields and functions. This
62+
# requires the `protoc` feature because it relies on `protoc`'s ability to
63+
# extract documentation comments from .proto files, something that the pure-Rust
64+
# parser in the `protobuf_codegen` crate can't do.
65+
#
66+
# This feature is disabled by default.
67+
generate-module-docs = ["protoc"]
68+
6169
# Uses the `inventory` crate (https://github.com/dtolnay/inventory) instead
6270
# of `linkme` (https://github.com/dtolnay/linkme) for tracking WASM exports.
6371
#

lib/build.rs

Lines changed: 221 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,17 @@
22
use protobuf::descriptor::FileDescriptorProto;
33

44
#[cfg(feature = "generate-proto-code")]
5-
fn generate_module_files(proto_files: Vec<FileDescriptorProto>) {
5+
/// Description of a YARA module discovered in a `.proto` file that carries
/// `yara.module_options`.
///
/// Instances are built by `generate_module_files` and handed to the other
/// generation steps in this build script.
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)]
struct Module {
    /// Module name, taken from the `name` module option.
    name: String,
    /// File name of the `.proto` file without its extension.
    proto_mod: String,
    /// Value of the `rust_module` module option, if any.
    rust_mod: Option<String>,
    /// Value of the `cargo_feature` module option, if any.
    cargo_feature: Option<String>,
    /// Value of the `root_message` module option. It is used as the starting
    /// point when computing which messages are reachable from a module.
    root_msg: String,
}
13+
14+
#[cfg(feature = "generate-proto-code")]
15+
fn generate_module_files(proto_files: &[FileDescriptorProto]) -> Vec<Module> {
616
use std::fs::File;
717
use std::io::Write;
818
use std::path::PathBuf;
@@ -12,6 +22,7 @@ fn generate_module_files(proto_files: Vec<FileDescriptorProto>) {
1222
println!("cargo:rerun-if-changed=src/modules/modules.rs");
1323

1424
let mut modules = Vec::new();
25+
1526
// Look for .proto files that describe a YARA module. A proto that
1627
// describes a YARA module has yara.module_options, like...
1728
//
@@ -25,7 +36,7 @@ fn generate_module_files(proto_files: Vec<FileDescriptorProto>) {
2536
if let Some(module_options) =
2637
yara_module_options.get(&proto_file.options)
2738
{
28-
let proto_path = PathBuf::from(proto_file.name.unwrap());
39+
let proto_path = PathBuf::from(proto_file.name.as_ref().unwrap());
2940
let proto_name = proto_path
3041
.with_extension("")
3142
.file_name()
@@ -34,13 +45,15 @@ fn generate_module_files(proto_files: Vec<FileDescriptorProto>) {
3445
.unwrap()
3546
.to_string();
3647

37-
modules.push((
38-
module_options.name.unwrap(),
39-
proto_name,
40-
module_options.rust_module,
41-
module_options.cargo_feature,
42-
module_options.root_message.unwrap(),
43-
));
48+
let root_msg = module_options.root_message.unwrap();
49+
50+
modules.push(Module {
51+
name: module_options.name.unwrap(),
52+
proto_mod: proto_name,
53+
rust_mod: module_options.rust_module,
54+
cargo_feature: module_options.cargo_feature,
55+
root_msg,
56+
});
4457
}
4558
}
4659

@@ -64,7 +77,7 @@ fn generate_module_files(proto_files: Vec<FileDescriptorProto>) {
6477
println!(
6578
"cargo:warning=to disable the warning set the environment variable YRX_REGENERATE_MODULES_RS=false"
6679
);
67-
return;
80+
return Vec::new();
6881
}
6982
};
7083

@@ -95,14 +108,14 @@ fn generate_module_files(proto_files: Vec<FileDescriptorProto>) {
95108
// no matter the platform. If modules are not sorted, the order will
96109
// vary from one platform to the other, in the same way that HashMap
97110
// doesn't produce consistent key order.
98-
modules.sort();
111+
modules.sort_by(|a, b| a.name.cmp(&b.name));
99112

100-
for m in modules {
101-
let name = m.0;
102-
let proto_mod = m.1;
103-
let rust_mod = m.2;
104-
let cargo_feature = m.3;
105-
let root_message = m.4;
113+
for m in &modules {
114+
let name = &m.name;
115+
let proto_mod = &m.proto_mod;
116+
let rust_mod = &m.rust_mod;
117+
let cargo_feature = &m.cargo_feature;
118+
let root_message = &m.root_msg;
106119

107120
// If the YARA module has an associated Rust module, this module must
108121
// have a function named "main". If the YARA module doesn't have an
@@ -145,6 +158,187 @@ add_module!(modules, "{name}", {proto_mod}, "{root_message}", {rust_mod_name}, {
145158
}
146159

147160
write!(add_modules_rs, "\n}}").unwrap();
161+
162+
modules
163+
}
164+
165+
/// Generates `src/modules/field_docs.rs`, a table with the documentation
/// comments attached to the fields of every message that is reachable from
/// the root message of some YARA module.
///
/// `proto_files` are the parsed descriptors of all `.proto` files and
/// `modules` are the YARA modules found in them. Files without source code
/// info are skipped, as they carry no comments.
///
/// The generated file contains a single constant:
///
/// ```text
/// pub const FIELD_DOCS: &[(&str, u64, &str)] = &[
///     (<fully qualified message name>, <field number>, <documentation>),
/// ];
/// ```
///
/// Entries are sorted so that the output is deterministic.
///
/// # Panics
///
/// Panics if the output file can't be created or written.
#[cfg(feature = "generate-module-docs")]
fn generate_module_docs(
    proto_files: &[FileDescriptorProto],
    modules: &[Module],
) {
    use std::collections::{HashMap, HashSet};
    use std::fs::File;
    use std::io::Write;

    /// Returns the name of a top-level message qualified with its package.
    fn qualified_name(
        package: &str,
        msg: &protobuf::descriptor::DescriptorProto,
    ) -> String {
        let msg_name = msg.name.as_deref().unwrap_or("");
        if package.is_empty() {
            msg_name.to_string()
        } else {
            format!("{}.{}", package, msg_name)
        }
    }

    /// Records, for `msg` and all its nested messages, the fully qualified
    /// names of the message types referenced by their fields.
    fn collect_deps(
        msg: &protobuf::descriptor::DescriptorProto,
        full_name: String,
        deps: &mut HashMap<String, Vec<String>>,
    ) {
        let mut referenced = Vec::new();
        for field in &msg.field {
            if field.type_()
                == protobuf::descriptor::field_descriptor_proto::Type::TYPE_MESSAGE
            {
                if let Some(type_name) = &field.type_name {
                    // Type names are fully qualified and start with a dot
                    // (e.g. ".pe.Section"), while our keys have no dot.
                    let dep_name = type_name
                        .strip_prefix('.')
                        .unwrap_or(type_name)
                        .to_string();
                    referenced.push(dep_name);
                }
            }
        }

        for nested in &msg.nested_type {
            let nested_name = format!(
                "{}.{}",
                full_name,
                nested.name.as_deref().unwrap_or("")
            );
            collect_deps(nested, nested_name, deps);
        }

        deps.insert(full_name, referenced);
    }

    /// Recursively traverses messages to build a map from source-info paths
    /// to message names and field numbers. The i-th item in the list of
    /// field numbers corresponds to the i-th field declared in the message.
    fn traverse_msg(
        msg: &protobuf::descriptor::DescriptorProto,
        path: Vec<i32>,
        full_name: String,
        map: &mut HashMap<Vec<i32>, (String, Vec<u64>)>,
    ) {
        let field_numbers = msg
            .field
            .iter()
            .map(|field| field.number.unwrap_or(0) as u64)
            .collect();

        map.insert(path.clone(), (full_name.clone(), field_numbers));

        for (k, nested) in msg.nested_type.iter().enumerate() {
            let mut nested_path = path.clone();
            nested_path.push(3); // 3 is nested_type in DescriptorProto
            nested_path.push(k as i32);
            let nested_name = format!(
                "{}.{}",
                full_name,
                nested.name.as_deref().unwrap_or("")
            );
            traverse_msg(nested, nested_path, nested_name, map);
        }
    }

    /// Escapes `text` so that it can be placed between double quotes in the
    /// generated Rust source.
    ///
    /// Backslashes must be escaped too, not only quotes. Otherwise a comment
    /// containing `\` would produce an invalid or different escape sequence,
    /// and a `\` at the end of a line would swallow the line break. Newlines
    /// are left untouched, they are valid inside a string literal.
    fn escape_str_literal(text: &str) -> String {
        let mut escaped = String::with_capacity(text.len());
        for c in text.chars() {
            match c {
                '\\' => escaped.push_str("\\\\"),
                '"' => escaped.push_str("\\\""),
                _ => escaped.push(c),
            }
        }
        escaped
    }

    // 1. Collect message dependencies
    let mut dependencies = HashMap::new();

    for proto_file in proto_files {
        let package = proto_file.package.as_deref().unwrap_or("");
        for msg in &proto_file.message_type {
            collect_deps(msg, qualified_name(package, msg), &mut dependencies);
        }
    }

    // 2. Compute transitive closure
    let mut reachable = HashSet::new();
    let mut queue: Vec<String> = Vec::new();

    for m in modules {
        if reachable.insert(m.root_msg.clone()) {
            queue.push(m.root_msg.clone());
        }
    }

    while let Some(node) = queue.pop() {
        if let Some(deps) = dependencies.get(&node) {
            for dep in deps {
                if reachable.insert(dep.clone()) {
                    queue.push(dep.clone());
                }
            }
        }
    }

    // 3. Generate docs only for reachable messages
    let mut docs = Vec::new();

    for proto_file in proto_files {
        // Comments live in the source code info, without it there is
        // nothing to extract from this file.
        let source_code_info = match proto_file.source_code_info.as_ref() {
            Some(info) => info,
            None => continue,
        };

        let package = proto_file.package.as_deref().unwrap_or("");
        let mut msg_map = HashMap::new();

        for (i, msg) in proto_file.message_type.iter().enumerate() {
            // 4 is message_type in FileDescriptorProto
            traverse_msg(
                msg,
                vec![4, i as i32],
                qualified_name(package, msg),
                &mut msg_map,
            );
        }

        for location in &source_code_info.location {
            // A field location has the form [<message path>.., 2, <index>],
            // where 2 is field in DescriptorProto and <index> is the
            // position of the field within the message.
            let (msg_path, field_idx) = match location.path.as_slice() {
                [msg_path @ .., 2, field_idx] => {
                    (msg_path, *field_idx as usize)
                }
                _ => continue,
            };

            if let Some((msg_name, field_numbers)) = msg_map.get(msg_path) {
                if !reachable.contains(msg_name) {
                    continue;
                }
                if let (Some(field_number), Some(comments)) = (
                    field_numbers.get(field_idx),
                    location.leading_comments.as_deref(),
                ) {
                    docs.push((
                        msg_name.clone(),
                        *field_number,
                        comments.trim().to_string(),
                    ));
                }
            }
        }
    }

    // Sort the entries so that the generated file doesn't depend on the
    // order in which the .proto files were processed.
    docs.sort();

    let mut field_docs_rs = File::create("src/modules/field_docs.rs").unwrap();

    writeln!(
        field_docs_rs,
        "// File generated automatically by build.rs. Do not edit.\n"
    )
    .unwrap();

    writeln!(field_docs_rs, "pub const FIELD_DOCS: &[(&str, u64, &str)] = &[")
        .unwrap();

    for (msg_name, field_number, comments) in docs {
        writeln!(
            field_docs_rs,
            r#" ("{}", {}, "{}"),"#,
            msg_name,
            field_number,
            escape_str_literal(&comments)
        )
        .unwrap();
    }

    writeln!(field_docs_rs, "];").unwrap();
}
149343

150344
#[cfg(feature = "generate-proto-code")]
@@ -162,6 +356,9 @@ fn generate_proto_code() {
162356
if cfg!(feature = "protoc") {
163357
proto_compiler.protoc();
164358
proto_parser.protoc();
359+
360+
#[cfg(feature = "generate-module-docs")]
361+
proto_parser.protoc_extra_args(["--include_source_info"]);
165362
} else {
166363
proto_compiler.pure();
167364
proto_parser.pure();
@@ -261,9 +458,13 @@ fn generate_proto_code() {
261458
}
262459

263460
if regenerate {
264-
generate_module_files(
265-
proto_parser.file_descriptor_set().unwrap().file,
266-
);
461+
let proto_files = proto_parser.file_descriptor_set().unwrap().file;
462+
463+
#[allow(unused_variables)]
464+
let modules = generate_module_files(&proto_files);
465+
466+
#[cfg(feature = "generate-module-docs")]
467+
generate_module_docs(&proto_files, &modules);
267468

268469
let out_dir = env::var("OUT_DIR").unwrap();
269470
let src_dir = PathBuf::from("src/modules/protos/generated");

lib/src/modules/field_docs.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// File generated automatically by build.rs. Do not edit.
2+
3+
pub const FIELD_DOCS: &[(&str, u64, &str)] = &[
4+
("dex.DexHeader", 2, "DEX version (35, 36, 37, ...)"),
5+
("lnk.Lnk", 1, "True if the file is a LNK file."),
6+
("lnk.Lnk", 2, "A description of the shortcut that is displayed to end users to identify
7+
the purpose of the link."),
8+
("lnk.Lnk", 3, "Time when the LNK file was created."),
9+
("lnk.Lnk", 4, "Time when the LNK file was last accessed."),
10+
("lnk.Lnk", 5, "Time when the LNK files was last modified."),
11+
("lnk.Lnk", 6, "Size of the target file in bytes. The target file is the file that this
12+
link references to. If the link target file is larger than 0xFFFFFFFF,
13+
this value specifies the least significant 32 bits of the link target file
14+
size."),
15+
("lnk.Lnk", 7, "Attributes of the link target file."),
16+
("lnk.Lnk", 8, "Location where the icon associated to the link is found. This is usually
17+
an EXE or DLL file that contains the icon among its resources. The
18+
specific icon to be used is indicated by the `icon_index` field."),
19+
("lnk.Lnk", 9, "Index of the icon that is associated to the link, within an icon location."),
20+
("lnk.Lnk", 10, "Expected window state of an application launched by this link."),
21+
("lnk.Lnk", 11, "Type of drive the link is stored on."),
22+
("lnk.Lnk", 12, "Drive serial number of the volume the link target is stored on."),
23+
("lnk.Lnk", 13, "Volume label of the drive the link target is stored on."),
24+
("lnk.Lnk", 14, "String used to construct the full path to the link target by appending the
25+
common_path_suffix field."),
26+
("lnk.Lnk", 15, "String used to construct the full path to the link target by being appended
27+
to the local_base_path field."),
28+
("lnk.Lnk", 16, "Location of the link target relative to the LNK file."),
29+
("lnk.Lnk", 17, "Path of the working directory to be used when activating the link target."),
30+
("lnk.Lnk", 18, "Command-line arguments that are specified when activating the link target."),
31+
("lnk.Lnk", 19, "Size in bytes of any extra data appended to the LNK file."),
32+
("lnk.Lnk", 20, "Offset within the LNK file where the overlay starts."),
33+
("lnk.Lnk", 21, "Distributed link tracker information."),
34+
("macho.Macho", 1, "Set Mach-O header and basic fields"),
35+
("macho.Macho", 29, "Add fields for Mach-O fat binary header"),
36+
("macho.Macho", 32, "Nested Mach-O files"),
37+
("pe.PE", 16, "Entry point as a file offset."),
38+
("pe.PE", 17, "Entry point as it appears in the PE header (RVA)."),
39+
("pe.Section", 1, "The section's name as listed in the section table. The data type is `bytes`
40+
instead of `string` so that it can accommodate invalid UTF-8 content. The
41+
length is 8 bytes at most."),
42+
("pe.Section", 2, "For section names longer than 8 bytes, the name in the section table (and
43+
in the `name` field) contains a forward slash (/) followed by an ASCII
44+
representation of a decimal number that is an offset into the string table.
45+
(examples: \"/4\", \"/123\") This mechanism is described in the MSDN and used
46+
by GNU compilers.
47+
48+
When this scenario occurs, the `full_name` field holds the actual section
49+
name. In all other cases, it simply duplicates the content of the `name`
50+
field.
51+
52+
See: https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-image_section_header#members"),
53+
("pe.Version", 1, "Major version."),
54+
("pe.Version", 2, "Minor version."),
55+
("test_proto2.TestProto2", 350, "This field will be visible in YARA as `bool_yara` instead of `bool_proto`."),
56+
("test_proto2.TestProto2", 351, "This field won't be visible to YARA."),
57+
("test_proto2.TestProto2", 500, "This field is accessible only if the features \"foo\" (or \"FOO\") and \"bar\"
58+
are enabled while compiling the YARA rules."),
59+
("test_proto2.TestProto2", 502, "The metadata received by the module is copied into this field."),
60+
];

0 commit comments

Comments
 (0)