Skip to content

Commit 5e2854f

Browse files
[#70] Fix analyze CRC for cah:/ resources; add independent --skip-crc
Recognize content-addressed stream paths (cah:/<hash>) produced by Unity 6.6 ContentDirectory builds: fold the path (which contains the content hash) into the CRC instead of opening the differently named resource file, which was failing with "Error opening resource file". Legacy CAB-*.resS/.resource streams still read their bytes for the CRC. Also addresses the performance issues from PR 66 without losing CRC coverage of external streams: - Fix UnityFileReader.ComputeCRC chunking (advance the file offset and handle the partial final chunk) so ranges larger than the buffer no longer produce a wrong CRC or over-read. - Fix the ProcessManagedReferenceData CRC size argument (stringSize + 4, not m_Offset + stringSize + 4). - Keep journal_mode = MEMORY (drop PR 66's ineffective WAL change). Add a --skip-crc option, fully independent of --skip-references: --skip-references now only skips reference extraction and no longer skips the CRC. The reference walk still resolves referenced object ids (so the CRC stays stable) but only inserts refs rows when extracting. Add a ComputeCRC unit test for buffer-boundary ranges and update the analyze documentation for the new flag semantics. Fixes #70.
1 parent cb6cb75 commit 5e2854f

11 files changed

Lines changed: 154 additions & 66 deletions

File tree

Analyzer/AnalyzerTool.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ public int Analyze(
2626
string databaseName,
2727
string searchPattern,
2828
bool skipReferences,
29+
bool skipCrc,
2930
bool verbose,
3031
bool noRecursion)
3132
{
@@ -40,6 +41,7 @@ public int Analyze(
4041
{
4142
parser.Verbose = verbose;
4243
parser.SkipReferences = skipReferences;
44+
parser.SkipCrc = skipCrc;
4345
parser.Init(writer.Connection);
4446

4547
}

Analyzer/PPtrAndCrcProcessor.cs

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,18 @@ public class PPtrAndCrcProcessor : IDisposable
1313
{
1414
public delegate int CallbackDelegate(long objectId, int fileId, long pathId, string propertyPath, string propertyType);
1515

16+
// Content-addressed stream paths (new ContentDirectory build output) look like
17+
// "cah:/<hash>". The hash already identifies the content, so the path itself is
18+
// folded into the CRC instead of opening the (differently named) resource file.
19+
private const string ContentAddressedPrefix = "cah:/";
20+
1621
private SerializedFile m_SerializedFile;
1722
private UnityFileReader m_Reader;
1823
private long m_Offset;
1924
private long m_ObjectId;
2025
private uint m_Crc32;
2126
private string m_Folder;
27+
private bool m_SkipCrc;
2228
private StringBuilder m_StringBuilder = new();
2329
private byte[] m_pptrBytes = new byte[4];
2430

@@ -27,11 +33,12 @@ public class PPtrAndCrcProcessor : IDisposable
2733
private Dictionary<string, UnityFileReader> m_resourceReaders = new();
2834

2935
public PPtrAndCrcProcessor(SerializedFile serializedFile, UnityFileReader reader, string folder,
30-
CallbackDelegate callback)
36+
bool skipCrc, CallbackDelegate callback)
3137
{
3238
m_SerializedFile = serializedFile;
3339
m_Reader = reader;
3440
m_Folder = folder;
41+
m_SkipCrc = skipCrc;
3542
m_Callback = callback;
3643
}
3744

@@ -79,6 +86,32 @@ private UnityFileReader GetResourceReader(string filename)
7986
return reader;
8087
}
8188

89+
// Folds `size` bytes of the main serialized file, starting at `offset`, into the
// running CRC. Leaves the CRC untouched when CRC computation has been disabled.
private void AppendCrc(long offset, int size)
{
    if (m_SkipCrc)
    {
        return;
    }

    m_Crc32 = m_Reader.ComputeCRC(offset, size, m_Crc32);
}
95+
96+
// Extends the CRC with the content of an external stream segment (StreamingInfo /
97+
// StreamedResource), unless CRC is disabled. Content-addressed paths fold in the path
98+
// string; other paths read the actual bytes from the companion resource file.
99+
private void AppendStreamCrc(long offset, int size, string path)
100+
{
101+
if (m_SkipCrc)
102+
return;
103+
104+
if (path.StartsWith(ContentAddressedPrefix))
105+
{
106+
m_Crc32 = Crc32Algorithm.Append(m_Crc32, Encoding.UTF8.GetBytes(path));
107+
return;
108+
}
109+
110+
var resourceFile = GetResourceReader(path);
111+
if (resourceFile != null)
112+
m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32);
113+
}
114+
82115
public uint Process(long objectId, long offset, TypeTreeNode node)
83116
{
84117
m_Offset = offset;
@@ -99,7 +132,7 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry)
99132
{
100133
if (node.IsBasicType)
101134
{
102-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, node.Size, m_Crc32);
135+
AppendCrc(m_Offset, node.Size);
103136
m_Offset += node.Size;
104137
}
105138
else if (node.IsArray)
@@ -136,12 +169,7 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry)
136169

137170
if (size > 0)
138171
{
139-
var resourceFile = GetResourceReader(filename);
140-
141-
if (resourceFile != null)
142-
{
143-
m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32);
144-
}
172+
AppendStreamCrc(offset, size, filename);
145173
}
146174
}
147175
else if (node.Type == "StreamedResource")
@@ -162,19 +190,14 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry)
162190

163191
if (size > 0)
164192
{
165-
var resourceFile = GetResourceReader(filename);
166-
167-
if (resourceFile != null)
168-
{
169-
m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32);
170-
}
193+
AppendStreamCrc(offset, size, filename);
171194
}
172195
}
173196
else if (node.CSharpType == typeof(string))
174197
{
175198
var prevOffset = m_Offset;
176199
m_Offset += m_Reader.ReadInt32(m_Offset) + 4;
177-
m_Crc32 = m_Reader.ComputeCRC(prevOffset, (int)(m_Offset - prevOffset), m_Crc32);
200+
AppendCrc(prevOffset, (int)(m_Offset - prevOffset));
178201
}
179202
else if (node.IsManagedReferenceRegistry)
180203
{
@@ -210,12 +233,12 @@ private void ProcessArray(TypeTreeNode node, bool isManagedReferenceRegistry, bo
210233
if (dataNode.IsBasicType)
211234
{
212235
var arraySize = m_Reader.ReadInt32(m_Offset);
213-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, dataNode.Size * arraySize + 4, m_Crc32);
236+
AppendCrc(m_Offset, dataNode.Size * arraySize + 4);
214237
m_Offset += dataNode.Size * arraySize + 4;
215238
}
216239
else
217240
{
218-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, 4, m_Crc32);
241+
AppendCrc(m_Offset, 4);
219242
var arraySize = m_Reader.ReadInt32(m_Offset);
220243
m_Offset += 4;
221244

@@ -239,7 +262,7 @@ private void ProcessArray(TypeTreeNode node, bool isManagedReferenceRegistry, bo
239262

240263
// First child is rid.
241264
long rid = m_Reader.ReadInt64(m_Offset);
242-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, 8, m_Crc32);
265+
AppendCrc(m_Offset, 8);
243266
m_Offset += 8;
244267

245268
ProcessManagedReferenceData(dataNode.Children[1], dataNode.Children[2], rid);
@@ -255,7 +278,7 @@ private void ProcessManagedReferenceRegistry(TypeTreeNode node)
255278

256279
// First child is version number.
257280
var version = m_Reader.ReadInt32(m_Offset);
258-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, node.Children[0].Size, m_Crc32);
281+
AppendCrc(m_Offset, node.Children[0].Size);
259282
m_Offset += node.Children[0].Size;
260283

261284
if (version == 1)
@@ -301,19 +324,19 @@ bool ProcessManagedReferenceData(TypeTreeNode refTypeNode, TypeTreeNode referenc
301324
throw new Exception("Invalid ReferencedManagedType");
302325

303326
var stringSize = m_Reader.ReadInt32(m_Offset);
304-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, (int)(m_Offset + stringSize + 4), m_Crc32);
327+
AppendCrc(m_Offset, stringSize + 4);
305328
var className = m_Reader.ReadString(m_Offset + 4, stringSize);
306329
m_Offset += stringSize + 4;
307330
m_Offset = (m_Offset + 3) & ~(3);
308331

309332
stringSize = m_Reader.ReadInt32(m_Offset);
310-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, (int)(m_Offset + stringSize + 4), m_Crc32);
333+
AppendCrc(m_Offset, stringSize + 4);
311334
var namespaceName = m_Reader.ReadString(m_Offset + 4, stringSize);
312335
m_Offset += stringSize + 4;
313336
m_Offset = (m_Offset + 3) & ~(3);
314337

315338
stringSize = m_Reader.ReadInt32(m_Offset);
316-
m_Crc32 = m_Reader.ComputeCRC(m_Offset, (int)(m_Offset + stringSize + 4), m_Crc32);
339+
AppendCrc(m_Offset, stringSize + 4);
317340
var assemblyName = m_Reader.ReadString(m_Offset + 4, stringSize);
318341
m_Offset += stringSize + 4;
319342
m_Offset = (m_Offset + 3) & ~(3);
@@ -347,11 +370,15 @@ private void ExtractPPtr(string referencedType)
347370
if (fileId != 0 || pathId != 0)
348371
{
349372
var refId = m_Callback(m_ObjectId, fileId, pathId, m_StringBuilder.ToString(), referencedType);
350-
m_pptrBytes[0] = (byte)(refId >> 24);
351-
m_pptrBytes[1] = (byte)(refId >> 16);
352-
m_pptrBytes[2] = (byte)(refId >> 8);
353-
m_pptrBytes[3] = (byte)(refId);
354-
m_Crc32 = Crc32Algorithm.Append(m_Crc32, m_pptrBytes);
373+
374+
if (!m_SkipCrc)
375+
{
376+
m_pptrBytes[0] = (byte)(refId >> 24);
377+
m_pptrBytes[1] = (byte)(refId >> 16);
378+
m_pptrBytes[2] = (byte)(refId >> 8);
379+
m_pptrBytes[3] = (byte)(refId);
380+
m_Crc32 = Crc32Algorithm.Append(m_Crc32, m_pptrBytes);
381+
}
355382
}
356383
}
357384
}

Analyzer/SQLite/Handlers/ISQLiteHandler.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,5 @@ public interface ISQLiteFileParser : IDisposable
2929
void Parse(string filename);
3030
public bool Verbose { get; set; }
3131
public bool SkipReferences { get; set; }
32+
public bool SkipCrc { get; set; }
3233
}

Analyzer/SQLite/Parsers/AddressablesBuildLayoutParser.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ public class AddressablesBuildLayoutParser : ISQLiteFileParser
1515

1616
public bool Verbose { get; set; }
1717
public bool SkipReferences { get; set; }
18+
public bool SkipCrc { get; set; }
1819

1920
public void Dispose()
2021
{

Analyzer/SQLite/Parsers/SerializedFileParser.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ public class SerializedFileParser : ISQLiteFileParser
1515

1616
public bool Verbose { get; set; }
1717
public bool SkipReferences { get; set; }
18+
public bool SkipCrc { get; set; }
1819

1920
public bool CanParse(string filename)
2021
{
@@ -36,7 +37,7 @@ public void Dispose()
3637

3738
public void Init(SqliteConnection db)
3839
{
39-
m_Writer = new SerializedFileSQLiteWriter(db, SkipReferences);
40+
m_Writer = new SerializedFileSQLiteWriter(db, SkipReferences, SkipCrc);
4041
}
4142

4243
public void Parse(string filename)

Analyzer/SQLite/Writers/SerializedFileSQLiteWriter.cs

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ public class SerializedFileSQLiteWriter : IDisposable
1919
private int m_NextAssetBundleId = 0;
2020

2121
private bool m_SkipReferences;
22+
private bool m_SkipCrc;
2223

2324
private IdProvider<string> m_SerializedFileIdProvider = new();
2425
private ObjectIdProvider m_ObjectIdProvider = new();
@@ -54,11 +55,12 @@ public class SerializedFileSQLiteWriter : IDisposable
5455
private SqliteConnection m_Database;
5556
private SqliteCommand m_LastId = new SqliteCommand();
5657
private SqliteTransaction m_CurrentTransaction = null;
57-
public SerializedFileSQLiteWriter(SqliteConnection database, bool skipReferences)
58+
public SerializedFileSQLiteWriter(SqliteConnection database, bool skipReferences, bool skipCrc)
5859
{
5960
m_Initialized = false;
6061
m_Database = database;
6162
m_SkipReferences = skipReferences;
63+
m_SkipCrc = skipCrc;
6264
}
6365

6466
public void Init()
@@ -116,7 +118,7 @@ public void WriteSerializedFile(string relativePath, string fullPath, string con
116118
{
117119
using var sf = UnityFileSystem.OpenSerializedFile(fullPath);
118120
using var reader = new UnityFileReader(fullPath, 64 * 1024 * 1024);
119-
using var pptrReader = new PPtrAndCrcProcessor(sf, reader, containingFolder, AddReference);
121+
using var pptrReader = new PPtrAndCrcProcessor(sf, reader, containingFolder, m_SkipCrc, AddReference);
120122
int serializedFileId = m_SerializedFileIdProvider.GetId(Path.GetFileName(fullPath).ToLower());
121123
int sceneId = -1;
122124

@@ -228,7 +230,10 @@ public void WriteSerializedFile(string relativePath, string fullPath, string con
228230
m_AddObjectCommand.SetValue("game_object", "");
229231
}
230232

231-
if (!m_SkipReferences)
233+
// The walk both extracts references and accumulates the CRC, so it is needed
234+
// unless both are disabled. When CRC is on but references are off, the walk
235+
// still resolves referenced object ids (AddReference skips the insert).
236+
if (!m_SkipReferences || !m_SkipCrc)
232237
{
233238
crc32 = pptrReader.Process(currentObjectId, offset, root);
234239
}
@@ -266,13 +271,20 @@ public void WriteSerializedFile(string relativePath, string fullPath, string con
266271

267272
private int AddReference(long objectId, int fileId, long pathId, string propertyPath, string propertyType)
268273
{
274+
// Always resolve the id so the CRC stays stable; only persist the row when references
275+
// are being extracted.
269276
var referencedObjectId = m_ObjectIdProvider.GetId((m_LocalToDbFileId[fileId], pathId));
270-
m_AddReferenceCommand.SetTransaction(m_CurrentTransaction);
271-
m_AddReferenceCommand.SetValue("object", objectId);
272-
m_AddReferenceCommand.SetValue("referenced_object", referencedObjectId);
273-
m_AddReferenceCommand.SetValue("property_path", propertyPath);
274-
m_AddReferenceCommand.SetValue("property_type", propertyType);
275-
m_AddReferenceCommand.ExecuteNonQuery();
277+
278+
if (!m_SkipReferences)
279+
{
280+
m_AddReferenceCommand.SetTransaction(m_CurrentTransaction);
281+
m_AddReferenceCommand.SetValue("object", objectId);
282+
m_AddReferenceCommand.SetValue("referenced_object", referencedObjectId);
283+
m_AddReferenceCommand.SetValue("property_path", propertyPath);
284+
m_AddReferenceCommand.SetValue("property_type", propertyType);
285+
m_AddReferenceCommand.ExecuteNonQuery();
286+
}
287+
276288
return referencedObjectId;
277289
}
278290

Documentation/analyzer.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ case, Unity will include the asset in all the AssetBundles with a reference to i
4747
view_potential_duplicates provides the number of instances and the total size of the potentially
4848
duplicated assets. It also lists all the AssetBundles where the asset was found.
4949

50-
If the skipReferences option is used, there will be a lot of false positives in that view. Otherwise,
50+
If the `--skip-crc` option is used, there will be a lot of false positives in that view. Otherwise,
5151
it should be very accurate because CRCs are used to determine if objects are identical.
5252

5353
## asset_view (AssetBundleProcessor)

Documentation/command-analyze.md

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ UnityDataTool analyze <path> [options]
1313
| `<path>` | Path to folder containing files to analyze | *(required)* |
1414
| `-o, --output-file <file>` | Output database filename | `database.db` |
1515
| `-p, --search-pattern <pattern>` | File search pattern (`*` and `?` supported) | `*` |
16-
| `-s, --skip-references` | Skip CRC and reference extraction (faster, smaller DB) | `false` |
16+
| `-s, --skip-references` | Do not extract references (smaller DB, no `refs` table). CRC is still computed. | `false` |
17+
| `--skip-crc` | Skip the CRC32 checksum calculation (faster; `objects.crc32` will be 0) | `false` |
1718
| `-v, --verbose` | Show more information during analysis | `false` |
1819
| `--no-recurse` | Do not recurse into sub-directories | `false` |
1920
| `-d, --typetree-data <file>` | Load an external TypeTree data file before processing (Unity 6.5+) ||
@@ -30,9 +31,9 @@ Analyze only `.bundle` files and specify a custom database name:
3031
UnityDataTool analyze /path/to/asset/bundles -o my_database.db -p "*.bundle"
3132
```
3233

33-
Fast analysis (skip reference tracking):
34+
Fastest analysis (skip both reference extraction and CRC):
3435
```bash
35-
UnityDataTool analyze /path/to/bundles -s
36+
UnityDataTool analyze /path/to/bundles --skip-references --skip-crc
3637
```
3738

3839
See also [Analyze Examples](../../Documentation/analyze-examples.md).
@@ -121,23 +122,27 @@ See [Comparing Builds](../../Documentation/comparing-builds.md) for strategies t
121122

122123
### Slow Analyze times, large output database
123124

124-
Consider using the `--skip-references` argument.
125+
Two independent flags reduce analyze time and database size:
125126

126-
A real life analyze of a big Addressables build shows how large a difference this can make:
127+
* `--skip-crc` skips the CRC32 calculation. This is usually the largest time saver, because computing a CRC requires reading the full content of every object, including large texture, mesh and audio data in companion `.resS`/`.resource` files.
128+
* `--skip-references` skips reference extraction, which is the largest contributor to database size (the `refs` table). The references are not needed for core asset inventory and size information.
127129

128-
* 208 seconds and producted a 500MB database (not specifying --skip-reference)
129-
* 9 seconds and produced a 68 MB file (with --skip-reference)
130+
For the fastest, smallest result, combine them.
130131

131-
The references are not needed for core asset inventory and size information.
132+
A real-life analysis of a large Addressables build, skipping both references and CRC, shows how large a difference this can make:
132133

133-
Note: When specifying `--skip-reference` some functionality is lost:
134+
* 208 seconds and produced a 500 MB database (default)
135+
* 9 seconds and produced a 68 MB file (with `--skip-references --skip-crc`)
136+
137+
When `--skip-references` is used, some functionality is lost:
134138

135139
* the `find-refs` command will not work
136140
* `view_material_shader_refs` and `view_material_texture_refs` will be empty
141+
* `script_object_view` will be empty
137142
* Queries that look at the relationships between objects will not work. For example, the `refs` table is required to link a `MonoBehaviour` to its `MonoScript`.
138-
* The `objects.crc32` column will be NULL/0 for all objects. This means:
139-
* No detection of identical objects by content hash (See [Comparing Builds](../../Documentation/comparing-builds.md))
140-
* The `view_potential_duplicates` view relies partially on CRC32 to distinguish true duplicates
141143

142-
Future work: The refs table contains a lot of repeated strings and could be made smaller and more efficient. It might also be prudent to control the CRC32 calculation using an independent flag.
144+
When `--skip-crc` is used, the `objects.crc32` column will be 0 for all objects. This means:
145+
146+
* No detection of identical objects by content hash (See [Comparing Builds](../../Documentation/comparing-builds.md))
147+
* The `view_potential_duplicates` view relies partially on CRC32 to distinguish true duplicates
143148

0 commit comments

Comments
 (0)