-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathGitDiffer.java
More file actions
397 lines (356 loc) · 18 KB
/
GitDiffer.java
File metadata and controls
397 lines (356 loc) · 18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
package org.variantsync.diffdetective.diff.git;
import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.diff.DiffEntry;
import org.eclipse.jgit.diff.DiffFormatter;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevTree;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.treewalk.*;
import org.eclipse.jgit.treewalk.filter.PathFilter;
import org.tinylog.Logger;
import org.variantsync.diffdetective.datasets.Repository;
import org.variantsync.diffdetective.diff.result.CommitDiffResult;
import org.variantsync.diffdetective.diff.result.DiffError;
import org.variantsync.diffdetective.diff.result.DiffParseException;
import org.variantsync.diffdetective.util.Assert;
import org.variantsync.diffdetective.util.StringUtils;
import org.variantsync.diffdetective.variation.DiffLinesLabel;
import org.variantsync.diffdetective.variation.diff.VariationDiff;
import org.variantsync.diffdetective.variation.diff.parse.VariationDiffParser;
import org.variantsync.diffdetective.variation.tree.source.GitSource;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class provides utility functions for obtaining diffs from git {@link Repository repositories}.
* <p>
* First, a {@link CommitDiff} is created for each commit.
* File changes in each commit are filtered using the {@link DiffFilter} of the {@link Repository#getDiffFilter() repository}.
* Then a {@link PatchDiff} is created from each file change.
* Finally, each patch is parsed to a {@link VariationDiff}.
*
* @author Soeren Viegener, Paul Maximilian Bittner
*/
public class GitDiffer {
// Matches the Unicode byte order mark (U+FEFF); used to strip BOMs from assembled full diffs.
private static final Pattern BOM_PATTERN = Pattern.compile("\\x{FEFF}");
// Matches a hunk header line of the form "@@ -<a>... +<b>... @@".
// Group 1 captures the number following "-", group 2 the number following "+".
private static final Pattern DIFF_HUNK_PATTERN = Pattern.compile( "^@@\\s-(\\d+).*\\+(\\d+).*@@$");
// Matches any line that starts with "diff --git " (multiline mode, so it is found anywhere in the text).
private static final Pattern GIT_HEADER_PATTERN = Pattern.compile( "^diff --git .*$", Pattern.MULTILINE);
// Matches any line that starts with "+++" (multiline mode); used to locate the end of the patch header.
private static final Pattern DIFF_HEADER_PATTERN = Pattern.compile( "^\\+\\+\\+.*$", Pattern.MULTILINE);
// Matches a line break (group 1) that is followed by the meta-line "\ No newline at end of file".
// The inline (?m) flag lets the trailing $ match at the end of that line, not only at the end of the input.
private static final Pattern NO_NEWLINE_PATTERN = Pattern.compile(
"(" + StringUtils.LINEBREAK_REGEX.pattern() + ")(?m)\\\\ No newline at end of file$");
/** Private constructor: this class only offers static methods and is not meant to be instantiated. */
private GitDiffer() {
}
/**
 * Creates a CommitDiff for the commit with the given hash by diffing it against its first parent.
 * <p>
 * The commit is looked up via {@link Repository#getCommit} and then handed to
 * {@link #createCommitDiffFromFirstParent(Repository, RevCommit)}.
 *
 * @param repository The git repo which the commit stems from.
 * @param commitHash The hash of the commit from which to create a CommitDiff
 * @return The CommitDiff of the given commit
 * @throws IOException if looking up the commit in the repository fails
 */
public static CommitDiffResult createCommitDiffFromFirstParent(
        Repository repository,
        String commitHash) throws IOException {
    final RevCommit commit = repository.getCommit(commitHash);
    return createCommitDiffFromFirstParent(repository, commit);
}
/**
 * Creates a CommitDiff that describes the changes of the given commit relative to its first parent.
 * The git diff is retrieved using JGit and a PatchDiff is created for each file in the diff.
 * <p>
 * A commit without any parent is diffed against a {@code null} parent, which
 * {@link #createCommitDiff(Repository, RevCommit, RevCommit)} treats as an empty tree.
 * <p>
 * This honors the {@link Repository#getDiffFilter() diff filter}
 * and the {@link Repository#getParseOptions() parser options} of the repository.
 *
 * @param repository The git repo which the commit stems from.
 * @param currentCommit The commit from which to create a CommitDiff
 * @return The CommitDiff of the given commit, or a failure with {@link DiffError#JGIT_ERROR}
 *         if the parent commit could not be parsed
 */
public static CommitDiffResult createCommitDiffFromFirstParent(
        Repository repository,
        RevCommit currentCommit) {
    // Root commits have nothing to be compared with.
    if (currentCommit.getParentCount() <= 0) {
        return createCommitDiff(repository, null, currentCommit);
    }

    // The parent referenced by the commit has to be parsed before it can be diffed.
    final RevCommit firstParent;
    try (var walk = new RevWalk(repository.getGitRepo().getRepository())) {
        firstParent = walk.parseCommit(currentCommit.getParent(0).getId());
    } catch (IOException e) {
        final String commitName = currentCommit.getId().getName();
        return CommitDiffResult.Failure(DiffError.JGIT_ERROR, "Could not parse parent commit of " + commitName + "!");
    }

    return createCommitDiff(repository, firstParent, currentCommit);
}
/**
 * Creates a CommitDiff that describes all changes made by the
 * given childCommit to the given parentCommit.
 * <p>
 * This honors the {@link Repository#getDiffFilter() diff filter}
 * and the {@link Repository#getParseOptions() parser options} of the repository.
 *
 * @param repository The git repo which the commit stems from.
 * @param parentCommit The commit that serves as the "before" state.
 *                     May be {@code null}, in which case the child is compared to an empty tree.
 * @param childCommit The commit that serves as the "after" state.
 * @return The CommitDiff describing all changes between the two commits, or a failure with
 *         {@link DiffError#JGIT_ERROR} if a commit has no tree or a tree could not be read.
 */
public static CommitDiffResult createCommitDiff(
        Repository repository,
        RevCommit parentCommit,
        RevCommit childCommit) {
    final boolean hasParent = parentCommit != null;

    if (childCommit.getTree() == null) {
        return CommitDiffResult.Failure(DiffError.JGIT_ERROR, "Could not obtain RevTree from child commit " + childCommit.getId());
    }
    if (hasParent && parentCommit.getTree() == null) {
        return CommitDiffResult.Failure(DiffError.JGIT_ERROR, "Could not obtain RevTree from parent commit " + parentCommit.getId());
    }

    // Set up one tree iterator per side of the diff.
    final CanonicalTreeParser childTree = new CanonicalTreeParser();
    final AbstractTreeIterator parentTree;
    try (ObjectReader reader = repository.getGitRepo().getRepository().newObjectReader()) {
        childTree.reset(reader, childCommit.getTree());
        if (hasParent) {
            final CanonicalTreeParser parsedParentTree = new CanonicalTreeParser();
            parsedParentTree.reset(reader, parentCommit.getTree());
            parentTree = parsedParentTree;
        } else {
            // Without a parent, everything in the child is diffed against an empty tree.
            parentTree = new EmptyTreeIterator();
        }
    } catch (IOException e) {
        return CommitDiffResult.Failure(DiffError.JGIT_ERROR, e.toString());
    }

    return getPatchDiffs(repository, parentTree, childTree, parentCommit, childCommit);
}
/**
 * The same as {@link #createCommitDiff(Repository, RevCommit, RevCommit)}
 * but the "after" state is the current working tree instead of a second commit.
 * <p>
 * If {@code commit} is {@code null}, the working tree is compared to an empty tree.
 *
 * @param repository The git repo which the commit stems from
 * @param commit The commit which the working tree is compared with
 * @return The CommitDiff of the given commit, or a failure with {@link DiffError#JGIT_ERROR}
 *         if the commit has no tree or its tree could not be read
 */
public static CommitDiffResult createWorkingTreeDiff(
        Repository repository,
        RevCommit commit) {
    final boolean hasCommit = commit != null;
    if (hasCommit && commit.getTree() == null) {
        return CommitDiffResult.Failure(DiffError.JGIT_ERROR, "Could not obtain RevTree from child commit " + commit.getId());
    }

    // "After" side: the files currently present in the working directory.
    final AbstractTreeIterator workingTree = new FileTreeIterator(repository.getGitRepo().getRepository());

    // "Before" side: the tree of the commit, or an empty tree if there is no commit.
    final AbstractTreeIterator committedTree;
    if (hasCommit) {
        try (ObjectReader reader = repository.getGitRepo().getRepository().newObjectReader()) {
            committedTree = new CanonicalTreeParser(null, reader, commit.getTree());
        } catch (IOException e) {
            return CommitDiffResult.Failure(DiffError.JGIT_ERROR, e.toString());
        }
    } else {
        committedTree = new EmptyTreeIterator();
    }

    // The working tree has no commit of its own, so the given commit is passed for both sides.
    return getPatchDiffs(repository, committedTree, workingTree, commit, commit);
}
/**
 * Obtains the CommitDiff between two commit's trees.
 * <p>
 * Every changed file that passes the repository's diff filter is formatted with JGit,
 * extended to a full diff (the complete file plus all changes), parsed to a
 * {@link VariationDiff}, and added as a {@link PatchDiff} to the resulting {@link CommitDiff}.
 * <p>
 * Failures that concern a single file (its previous content cannot be read, or its diff cannot
 * be parsed) do not abort the whole diff: they are logged, recorded in the error list of the
 * result, and the file is skipped. A failure of JGit while scanning or formatting yields a
 * failure result with {@link DiffError#JGIT_ERROR}.
 * <p>
 * This honors the {@link Repository#getDiffFilter() diff filter}
 * and the {@link Repository#getParseOptions() parser options} of the repository.
 *
 * @param repository The git repo which the commit stems from
 * @param prevTreeParser The tree parser for parentCommit
 * @param currentTreeParser The tree parser for childCommit or the working tree
 * @param parentCommit The {@link RevCommit} for the parent commit
 * @param childCommit The {@link RevCommit} for the child commit (equal to parentCommit if working tree is requested)
 * @return {@link CommitDiffResult}
 */
private static CommitDiffResult getPatchDiffs(
        Repository repository,
        AbstractTreeIterator prevTreeParser,
        AbstractTreeIterator currentTreeParser,
        RevCommit parentCommit,
        RevCommit childCommit) {
    final CommitDiff commitDiff = new CommitDiff(childCommit, parentCommit);
    final List<DiffError> errors = new ArrayList<>();

    // get PatchDiffs
    try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
         DiffFormatter diffFormatter = new DiffFormatter(outputStream))
    {
        diffFormatter.setRepository(repository.getGitRepo().getRepository());
        diffFormatter.setDetectRenames(true);
        diffFormatter.getRenameDetector().setRenameScore(50);

        List<DiffEntry> entries = diffFormatter.scan(prevTreeParser, currentTreeParser);
        for (DiffEntry diffEntry : entries) {
            if (!repository.getDiffFilter().filter(diffEntry)) {
                continue;
            }

            // The formatter writes the patch of this entry into outputStream,
            // which is emptied again at the end of each iteration.
            diffFormatter.format(diffEntry);
            final String gitDiff = outputStream.toString(StandardCharsets.UTF_8);

            // NOTE(review): JGit documents the old path of ADD entries as "/dev/null" -
            // confirm that this is the intended path for the GitSource created below.
            final String filename = diffEntry.getOldPath();

            final Matcher matcher = DIFF_HEADER_PATTERN.matcher(gitDiff);
            final String strippedDiff;
            if (matcher.find()) {
                // Drop the patch header, i.e., everything up to and including the line break that
                // ends the "+++ ..." line. The index is clamped so that a header without a trailing
                // line break yields an empty diff instead of a StringIndexOutOfBoundsException.
                strippedDiff = gitDiff.substring(Math.min(matcher.end() + 1, gitDiff.length()));
            } else if (GIT_HEADER_PATTERN.matcher(gitDiff).find()) {
                // Check whether it is a diff returned by `git diff` and not one created by some other means
                strippedDiff = "";
            } else {
                // It is a diff from another source (e.g., manually created or copy-pasted from GitHub)
                strippedDiff = gitDiff;
            }

            try {
                String fullDiff = switch (diffEntry.getChangeType()) {
                    case ADD, DELETE -> {
                        if (strippedDiff.isEmpty()) {
                            // Addition or deletion of an empty file
                            yield "";
                        }

                        // The first line contains meta information "@@ ... " that we want to skip.
                        final String[] hunkBeginAndRest = StringUtils.LINEBREAK_REGEX.split(strippedDiff, 2);
                        Assert.assertTrue(hunkBeginAndRest.length == 2, "Hunk is only one line. Is it a hunk? Hunk: " + strippedDiff);
                        yield hunkBeginAndRest[1];
                    }
                    case RENAME, COPY, MODIFY -> {
                        // Close the reader on the repository object once the full diff is assembled.
                        try (BufferedReader beforeFullFile = getBeforeFullFile(repository, parentCommit, filename);
                             BufferedReader changedLines = new BufferedReader(new StringReader(strippedDiff))) {
                            yield getFullDiff(beforeFullFile, changedLines);
                        }
                    }
                };

                // Iff a file does not end with a newline character, git adds a meta-line to the diff, which states
                // the absence of the newline. If this is the case, we remove the meta-line, in order not to
                // parse it as artifact line. If the meta-line does not exist, we add a newline, which adds an empty
                // line to the end of the diff. Without this empty line, we would lose the information about the
                // newline during the next parse step, which splits the text into lines and removes all newline
                // characters.
                // TODO: In future versions, we might want to track the newline more explicitly
                final Matcher newlineMatcher = NO_NEWLINE_PATTERN.matcher(fullDiff);
                if (newlineMatcher.find()) {
                    fullDiff = newlineMatcher.replaceAll("");
                } else {
                    fullDiff += StringUtils.LINEBREAK;
                }

                // NOTE(review): childCommit is dereferenced here, so a working tree diff that was
                // requested without a commit (childCommit == null) would fail at this point - confirm.
                final VariationDiff<DiffLinesLabel> variationDiff = VariationDiffParser.createVariationDiff(
                        fullDiff,
                        new GitSource(repository, childCommit.getId().name(), Path.of(filename)),
                        repository.getParseOptions().variationDiffParseOptions()
                );

                // not storing the full diff reduces memory usage by around 40-50%
                final String diffToRemember = switch (repository.getParseOptions().diffStoragePolicy()) {
                    case DO_NOT_REMEMBER -> "";
                    case REMEMBER_DIFF -> gitDiff;
                    case REMEMBER_FULL_DIFF -> fullDiff;
                    case REMEMBER_STRIPPED_DIFF -> strippedDiff;
                };

                commitDiff.addPatchDiff(new PatchDiff(
                        commitDiff,
                        diffEntry,
                        diffToRemember,
                        variationDiff
                ));
            } catch (IOException | UncheckedIOException e) {
                // getFullDiff reports read errors as UncheckedIOException. Handle them like any other
                // failure to read the previous file content instead of aborting the whole commit.
                Logger.debug(e, "Could not obtain full diff of file " + filename + " before commit " + parentCommit + "!");
                errors.add(DiffError.COULD_NOT_OBTAIN_FULLDIFF);
            } catch (DiffParseException e) {
                StringBuilder logMessage = new StringBuilder();
                logMessage
                        .append("parse exception for commit ")
                        .append(childCommit.getName())
                        .append(" in line ")
                        .append(e.getLineNumber())
                        .append(" of patch ")
                        .append(diffEntry.getOldPath())
                        .append(" -> ")
                        .append(diffEntry.getNewPath())
                        .append("\n");
                logMessage
                        .append(e)
                        .append("\n");
                if (e.getMessage() != null) {
                    logMessage
                            .append("error message: ")
                            .append(e.getMessage())
                            .append("\n");
                }
                if (e.getCause() != null) {
                    logMessage
                            .append("cause: ")
                            .append(e.getCause());
                }
                Logger.debug(logMessage);
                errors.add(e.getError());
            }

            // Make room for the patch of the next entry.
            outputStream.reset();
        }
    } catch (IOException e) {
        return CommitDiffResult.Failure(DiffError.JGIT_ERROR, e.toString());
    }

    return new CommitDiffResult(Optional.of(commitDiff), errors);
}
/**
 * Creates a full git diff from a file before the change and the git diff containing only the
 * changed lines.
 * <p>
 * The hunks of the diff are processed in order. For each hunk header, the lines of the
 * before-file that precede the hunk are emitted as unchanged lines (prefixed with a space).
 * All other diff lines are copied verbatim, and each of them that does not start with
 * {@code +} consumes one line of the before-file. Once the diff is exhausted, the remaining
 * lines of the before-file are emitted as unchanged lines. Finally, all byte order marks are
 * removed from the result.
 * <p>
 * Neither of the given readers is closed by this method.
 *
 * @param beforeFile The full file before the change
 * @param gitDiff The git diff containing only the changed lines
 * @return A full git diff containing the complete file and all changes
 * @throws UncheckedIOException if reading from one of the readers fails
 */
public static String getFullDiff(BufferedReader beforeFile, BufferedReader gitDiff) {
    try {
        LineNumberReader before = new LineNumberReader(beforeFile);
        List<String> fullDiffLines = new ArrayList<>();

        String diffLine;
        while ((diffLine = gitDiff.readLine()) != null) {
            Matcher matcher = DIFF_HUNK_PATTERN.matcher(diffLine);
            if (matcher.find()) {
                // found diffHunkRegex
                // subtract 1 because line numbers start at 1
                int beforeDiffIndex = Integer.parseInt(matcher.group(1)) - 1;

                // Emit the unchanged lines between the previous hunk and this one.
                while (before.getLineNumber() < beforeDiffIndex) {
                    final String unchangedLine = before.readLine();
                    if (unchangedLine == null) {
                        // The before-file is shorter than the hunk header claims. At the end of the
                        // stream the line number no longer advances, so without this check the loop
                        // would never terminate.
                        break;
                    }
                    fullDiffLines.add(" " + unchangedLine);
                }
            } else {
                fullDiffLines.add(diffLine);
                if (!diffLine.startsWith("+")) {
                    // Unchanged and removed lines exist in the before-file, so skip them there.
                    before.readLine();
                }
            }
        }

        // Everything after the last hunk is unchanged.
        String beforeLine;
        while ((beforeLine = before.readLine()) != null) {
            fullDiffLines.add(" " + beforeLine);
        }

        String fullDiff = String.join(StringUtils.LINEBREAK, fullDiffLines);

        // JGit seems to put BOMs in weird locations somewhere in the files
        // We need to remove those or the regex matching for the lines fails
        fullDiff = BOM_PATTERN.matcher(fullDiff).replaceAll("");

        return fullDiff;
    } catch (IOException e) {
        // Going up the call chain, it can be seen, that all callers need functions which do
        // not throw any checked exception, so just rethrow an unchecked one.
        throw new UncheckedIOException(e);
    }
}
/**
 * Opens the content of a file as it is stored in the tree of the given commit.
 * <p>
 * To obtain the content of a file before a change, pass the commit that precedes the change
 * (i.e., the parent commit) together with the path the file had in that commit.
 * <p>
 * The content is decoded as UTF-8, which is the same charset this class uses to decode the
 * diffs produced by JGit. The caller is responsible for closing the returned reader.
 *
 * @param repository The repository which contains {@code commit}
 * @param commit The commit from whose tree the file is read
 * @param filename The name of the file
 * @return A reader for the full content of the file in the tree of {@code commit}
 * @throws FileNotFoundException if the tree of {@code commit} contains no entry matching {@code filename}
 * @throws IOException if the tree or the file content cannot be read from the repository
 */
public static BufferedReader getBeforeFullFile(Repository repository, RevCommit commit, String filename) throws IOException {
    Git git = repository.getGitRepo();
    RevTree tree = commit.getTree();

    try (TreeWalk treeWalk = new TreeWalk(git.getRepository())) {
        treeWalk.addTree(tree);
        treeWalk.setRecursive(true);
        treeWalk.setFilter(PathFilter.create(filename));

        // Look for the first file that matches filename.
        if (!treeWalk.next()) {
            throw new FileNotFoundException("Couldn't find " + filename + " in the commit " + commit);
        }

        ObjectId objectId = treeWalk.getObjectId(0);
        ObjectLoader loader = git.getRepository().open(objectId);
        // Decode explicitly as UTF-8 instead of relying on the platform default charset,
        // so that the file content and the diff text are decoded consistently.
        return new BufferedReader(new InputStreamReader(loader.openStream(), StandardCharsets.UTF_8));
    }
}
}