codeql/javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java at c583b480afe7c7e97a217abaa9e6c57c87bee4cd · asgerf/codeql · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package com.semmle.js.extractor;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.lang.ProcessBuilder.Redirect;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryNotEmptyException;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.google.gson.Gson;
import com.google.gson.JsonParseException;
import com.semmle.js.extractor.tsconfig.TsConfigJson;
import com.semmle.js.extractor.tsconfig.CompilerOptions;
import com.semmle.js.dependencies.AsyncFetcher;
import com.semmle.js.dependencies.DependencyResolver;
import com.semmle.js.dependencies.packument.PackageJson;
import com.semmle.js.extractor.ExtractorConfig.SourceType;
import com.semmle.js.extractor.FileExtractor.FileType;
import com.semmle.js.extractor.trapcache.DefaultTrapCache;
import com.semmle.js.extractor.trapcache.DummyTrapCache;
import com.semmle.js.extractor.trapcache.ITrapCache;
import com.semmle.js.parser.ParseError;
import com.semmle.js.parser.ParsedProject;
import com.semmle.ts.extractor.TypeScriptParser;
import com.semmle.ts.extractor.TypeScriptWrapperOOMError;
import com.semmle.util.data.StringUtil;
import com.semmle.util.diagnostic.DiagnosticLevel;
import com.semmle.util.diagnostic.DiagnosticLocation;
import com.semmle.util.diagnostic.DiagnosticWriter;
import com.semmle.util.exception.CatastrophicError;
import com.semmle.util.exception.Exceptions;
import com.semmle.util.exception.ResourceError;
import com.semmle.util.exception.UserError;
import com.semmle.util.extraction.ExtractorOutputConfig;
import com.semmle.util.files.FileUtil;
import com.semmle.util.files.FileUtil8;
import com.semmle.util.io.WholeIO;
import com.semmle.util.io.csv.CSVReader;
import com.semmle.util.language.LegacyLanguage;
import com.semmle.util.process.Env;
import com.semmle.util.process.Env.OS;
import com.semmle.util.projectstructure.ProjectLayout;
import com.semmle.util.trap.TrapWriter;

/**
 * An alternative entry point to the JavaScript extractor.
 *
 * <p>It assumes the following environment variables to be set:
 *
 * <ul>
 *   <li><code>LGTM_SRC</code>: the source root;
 *   <li><code>SEMMLE_DIST</code>: the distribution root.
 * </ul>
 *
 * <p>Additionally, the following environment variables may be set to customise extraction
 * (explained in more detail below):
 *
 * <ul>
 *   <li><code>LGTM_INDEX_INCLUDE</code>: a newline-separated list of paths to include
 *   <li><code>LGTM_INDEX_EXCLUDE</code>: a newline-separated list of paths to exclude
 *   <li><code>LGTM_REPOSITORY_FOLDERS_CSV</code>: the path of a CSV file containing file
 *       classifications
 *   <li><code>LGTM_INDEX_FILTERS</code>: a newline-separated list of strings of form "include:PATTERN"
 *      or "exclude:PATTERN" that can be used to refine the list of files to include and exclude.
 *   <li><code>LGTM_INDEX_TYPESCRIPT</code>: whether to extract TypeScript
 *   <li><code>LGTM_INDEX_FILETYPES</code>: a newline-separated list of ".extension:filetype" pairs
 *       specifying which {@link FileType} to use for the given extension; the additional file type
 *       <code>XML</code> is also supported
 *   <li><code>LGTM_INDEX_XML_MODE</code>: whether to extract XML files
 *   <li><code>LGTM_THREADS</code>: the maximum number of files to extract in parallel
 * </ul>
 *
 * <p>It extracts the following:
 *
 * <ol>
 *   <li>all <code>*.js</code> files under <code>$SEMMLE_DIST/tools/data/externs</code> (cf. {@link
 *       AutoBuild#extractExterns()});
 *   <li>all source code files (cf. {@link AutoBuild#extractSource()}).
 * </ol>
 *
 * <p>In the second step, the set of files to extract is determined in two phases: the walking
 * phase, which computes a set of candidate files, and the filtering phase. A file is extracted if
 * it is a candidate, its type is supported (cf. {@link FileExtractor#supports(File)}), and it is
 * not filtered out in the filtering phase.
 *
 * <p>The walking phase is parameterised by a set of <i>include paths</i> and a set of <i>exclude
 * paths</i>. By default, the single include path is <code>LGTM_SRC</code>. If the environment
 * variable <code>LGTM_INDEX_INCLUDE</code> is set, it is interpreted as a newline-separated list of
 * include paths, which are slash-separated paths relative to <code>LGTM_SRC</code>. This list
 * <i>replaces</i> (rather than extends) the default include path.
 *
 * <p>Similarly, the set of exclude paths is determined by the environment variables <code>
 * LGTM_INDEX_EXCLUDE</code> and <code>LGTM_REPOSITORY_FOLDERS_CSV</code>. The former is interpreted
 * like <code>LGTM_INDEX_INCLUDE</code>, that is, a newline-separated list of exclude paths relative
 * to <code>LGTM_SRC</code>. The latter is interpreted as the path of a CSV file, where each line in
 * the file consists of a classification tag and an absolute path; any path classified as "external"
 * or "metadata" becomes an exclude path. Note that there are no implicit exclude paths.
 *
 * <p>The walking phase starts at each include path in turn and recursively traverses folders and
 * files. Symlinks and most hidden folders are skipped, but not hidden files. If it encounters a
 * sub-folder whose path is excluded, traversal stops. If it encounters a file, that file becomes a
 * candidate, unless its path is excluded. If the path of a file is both an include path and an
 * exclude path, the inclusion takes precedence, and the file becomes a candidate after all.
 *
 * <p>If an include or exclude path cannot be resolved, a warning is printed and the path is
 * ignored.
 *
 * <p>Note that the overall effect of this procedure is that the precedence of include and exclude
 * paths is derived from their specificity: a more specific include/exclude takes precedence over a
 * less specific include/exclude. In case of a tie, the include takes precedence.
 *
 * <p>The filtering phase is parameterised by a list of include/exclude patterns in the style of
 * {@link ProjectLayout} specifications. There are some built-in include/exclude patterns discussed
 * below. Additionally, the environment variable <code>LGTM_INDEX_FILTERS</code> is interpreted as a
 * newline-separated list of patterns to append to that list (hence taking precedence over the
 * built-in patterns). Unlike for {@link ProjectLayout}, patterns in <code>LGTM_INDEX_FILTERS</code>
 * use the syntax <code>include: pattern</code> for inclusions and <code>exclude: pattern</code> for
 * exclusions.
 *
 * <p>The default inclusion patterns cause the following files to be included:
 *
 * <ul>
 *   <li>All JavaScript files, that is, files with one of the extensions supported by {@link
 *       FileType#JS} (currently ".js", ".jsx", ".mjs", ".cjs", ".es6", ".es", ".xsjs", ".xsjslib").
 *   <li>All HTML files, that is, files with one of the extensions supported by {@link
 *       FileType#HTML} (currently ".htm", ".html", ".xhtm", ".xhtml", ".vue", ".html.erb", ".html.dot", ".jsp").
 *   <li>All YAML files, that is, files with one of the extensions supported by {@link
 *       FileType#YAML} (currently ".raml", ".yaml", ".yml").
 *   <li>Files with base name "package.json" or "tsconfig.json", and files whose base name
 *       is of the form "codeql-javascript-*.json".
 *   <li>JavaScript, JSON or YAML files whose base name starts with ".eslintrc".
 *   <li>JSON files whose base name is ".xsaccess".
 *   <li>JSON files whose base name is "xs-app.json".
 *   <li>JSON files whose base name ends with ".view.json".
 *   <li>JSON files whose base name is "manifest.json".
 *   <li>All extension-less files.
 * </ul>
 *
 * <p>Additionally, if the environment variable <code>LGTM_INDEX_TYPESCRIPT</code> is set to "basic"
 * or "full" (default), files with one of the extensions supported by {@link FileType#TYPESCRIPT}
 * (currently ".ts" and ".tsx") are also included. In case of "full", type information from the
 * TypeScript compiler is extracted as well.
 *
 * <p>The environment variable <code>LGTM_INDEX_FILETYPES</code> may be set to a newline-separated
 * list of file type specifications of the form <code>.extension:filetype</code>, causing all files
 * whose name ends in <code>.extension</code> to also be included by default.
 *
 * <p>The default exclusion patterns cause the following files to be excluded:
 *
 * <ul>
 *   <li>All JavaScript files whose name ends with <code>-min.js</code> or <code>.min.js</code>.
 *       Such files typically contain minified code. Since LGTM by default does not show results in
 *       minified files, it is not usually worth extracting them in the first place.
 * </ul>
 *
 * <p>JavaScript files are normally extracted with {@link SourceType#AUTO}, but an explicit source
 * type can be specified in the environment variable <code>LGTM_INDEX_SOURCE_TYPE</code>.
 *
 * <p>The file type as which a file is extracted can be customised via the <code>
 * LGTM_INDEX_FILETYPES</code> environment variable explained above.
 *
 * <p>If <code>LGTM_INDEX_XML_MODE</code> is set to <code>ALL</code>, then all files with extension
 * <code>.xml</code> under <code>LGTM_SRC</code> are extracted as XML (in addition to any files
 * whose file type is specified to be <code>XML</code> via <code>LGTM_INDEX_FILETYPES</code>).
 * Currently XML extraction does not respect inclusion and exclusion filters, but this is a bug, not
 * a feature, and hence will change eventually.
 *
 * <p>Note that all these customisations only apply to <code>LGTM_SRC</code>. Extraction of externs
 * is not customisable.
 *
 * <p>To customise the actual extraction (as opposed to determining which files to extract), the
 * following environment variables are available:
 *
 * <ul>
 *   <li><code>LGTM_THREADS</code> determines how many threads are used for parallel extraction of
 *       JavaScript files (TypeScript files cannot currently be extracted in parallel). If left
 *       unspecified, the extractor uses a single thread.
 *   <li><code>LGTM_TRAP_CACHE</code> and <code>LGTM_TRAP_CACHE_BOUND</code> can be used to specify
 *       the location and size of a trap cache to be used during extraction.
 * </ul>
 */
public class AutoBuild {
  /** Output configuration for the JavaScript extractor; handed to {@link FileExtractor}. */
  private final ExtractorOutputConfig outputConfig;
  /** Trap cache obtained from the extractor options. */
  private final ITrapCache trapCache;
  /** Maps extensions listed in <code>LGTM_INDEX_FILETYPES</code> to the file type to use for them. */
  private final Map<String, FileType> fileTypes = new LinkedHashMap<>();
  /** Real paths at which the walking phase starts. */
  private final Set<Path> includes = new LinkedHashSet<>();
  /** Real paths that are excluded from the walking phase. */
  private final Set<Path> excludes = new LinkedHashSet<>();
  /** Extensions (stored without their leading dot) of files to extract as XML. */
  private final Set<String> xmlExtensions = new LinkedHashSet<>();
  /** Include/exclude patterns for the filtering phase; assigned by {@code setupFilters()}. */
  private ProjectLayout filters;
  /** The real path of the source root, and the extractor distribution root. */
  private final Path LGTM_SRC, SEMMLE_DIST;
  /** Value of <code>LGTM_INDEX_DEFAULT_ENCODING</code>; {@code null} if that variable yields no value. */
  private final String defaultEncoding;
  /** Pool used for parallel extraction; {@code null} when extraction is single-threaded. */
  private ExecutorService threadPool;
  /** Whether any code has been seen; read through {@code hasSeenCode()}. */
  private volatile boolean seenCode = false;
  /** Whether any files have been seen; only used to pick the warning when no code was seen. */
  private volatile boolean seenFiles = false;
  /** Parsed from <code>LGTM_INDEX_TYPESCRIPT_INSTALL_DEPS</code> in the constructor. */
  private boolean installDependencies = false;
  /** Created by {@code makeVirtualSourceRoot()} during construction. */
  private final VirtualSourceRoot virtualSourceRoot;
  /** Extractor state; a fresh instance is created at the end of the constructor. */
  private ExtractorState state;
  /** Read from the prefixed <code>MAX_FILE_SIZE</code> setting; presumably defaults to 10 — TODO confirm. */
  private final long maximumFileSizeInMegabytes;

  /** The default timeout when installing dependencies, in milliseconds. */
  public static final int INSTALL_DEPENDENCIES_DEFAULT_TIMEOUT = 10 * 60 * 1000; // 10 minutes

  /**
   * Creates an auto-builder that is configured entirely from environment variables.
   *
   * <p>A {@link UserError} is thrown if <code>LGTM_SRC</code> is not set or if one of the
   * customisation variables holds an invalid value; a {@link ResourceError} is thrown if the real
   * path of the source root cannot be computed.
   */
  public AutoBuild() {
    // The source root comes first: makeVirtualSourceRoot() and setupMatchers() both read it.
    this.LGTM_SRC = toRealPath(getPathFromEnvVar("LGTM_SRC"));
    this.SEMMLE_DIST = Paths.get(EnvironmentVariables.getExtractorRoot());
    this.outputConfig = new ExtractorOutputConfig(LegacyLanguage.JAVASCRIPT);
    this.trapCache = ITrapCache.fromExtractorOptions();
    this.defaultEncoding = getEnvVar("LGTM_INDEX_DEFAULT_ENCODING");
    this.installDependencies = Boolean.valueOf(getEnvVar("LGTM_INDEX_TYPESCRIPT_INSTALL_DEPS"));
    this.virtualSourceRoot = makeVirtualSourceRoot();
    this.maximumFileSizeInMegabytes = EnvironmentVariables.getMegabyteCountFromPrefixedEnv("MAX_FILE_SIZE", 10);
    // File types must be known before the filters are built, since setupFilters() adds a pattern
    // for every explicitly configured extension.
    setupFileTypes();
    setupXmlMode();
    setupMatchers();
    this.state = new ExtractorState();
  }

  /**
   * Creates the virtual source root from the source root and the real path of the scratch
   * directory.
   *
   * <p>This is invoked from the constructor after <code>LGTM_SRC</code> has been assigned.
   */
  protected VirtualSourceRoot makeVirtualSourceRoot() {
    Path scratchDir = toRealPath(Paths.get(EnvironmentVariables.getScratchDir()));
    return new VirtualSourceRoot(LGTM_SRC, scratchDir);
  }

  /**
   * Looks up the environment variable {@code envVarName}, yielding {@code null} when the
   * two-argument overload finds no value for it.
   */
  private String getEnvVar(String envVarName) {
    return getEnvVar(envVarName, null);
  }

  /**
   * Looks up the environment variable {@code envVarName} via {@code Env.systemEnv().getNonEmpty},
   * falling back to {@code deflt} when that lookup yields {@code null}.
   */
  private String getEnvVar(String envVarName, String deflt) {
    String value = Env.systemEnv().getNonEmpty(envVarName);
    return value != null ? value : deflt;
  }

  /**
   * Reads the environment variable {@code envVarName} and converts its value to a {@link Path}.
   *
   * @throws UserError if the variable has no value
   */
  private Path getPathFromEnvVar(String envVarName) {
    String value = getEnvVar(envVarName);
    if (value == null) {
      throw new UserError(envVarName + " must be set.");
    }
    return Paths.get(value);
  }

  /**
   * Reads the environment variable {@code envVarName} and interprets its upper-cased value as a
   * constant of {@code enumClass}.
   *
   * @return the matching constant, or {@code defaultValue} if the variable has no value
   * @throws UserError if the value does not name a constant of {@code enumClass}; the message
   *     lists the permitted values in lower case
   */
  private <T extends Enum<T>> T getEnumFromEnvVar(
      String envVarName, Class<T> enumClass, T defaultValue) {
    String rawValue = getEnvVar(envVarName);
    if (rawValue == null) {
      return defaultValue;
    }
    try {
      return Enum.valueOf(enumClass, StringUtil.uc(rawValue));
    } catch (IllegalArgumentException ex) {
      Exceptions.ignore(ex, "We rewrite this to a meaningful user error.");
      Object[] allowedNames =
          Arrays.stream(enumClass.getEnumConstants())
              .map(constant -> StringUtil.lc(constant.toString()))
              .toArray();
      throw new UserError(
          envVarName + " must be set to one of: " + StringUtil.glue(", ", allowedNames));
    }
  }

  /**
   * Resolves {@code p} with {@link Path#toRealPath(java.nio.file.LinkOption...)}.
   *
   * @param p the path to resolve
   * @return the real path of {@code p}
   * @throws ResourceError if the real path cannot be computed; the underlying {@link IOException}
   *     is attached as the cause
   */
  private Path toRealPath(Path p) {
    try {
      return p.toRealPath();
    } catch (IOException ioException) {
      String message = "Could not compute real path for " + p + ".";
      throw new ResourceError(message, ioException);
    }
  }

  /**
   * Processes <code>LGTM_INDEX_FILETYPES</code>, a newline-separated list of
   * <code>.extension:filetype</code> specifications.
   *
   * <p>Blank lines and lines that do not consist of exactly two colon-separated fields are
   * skipped. The file type <code>XML</code> registers the extension (minus its first character)
   * as an XML extension; any other file type must name a {@link FileType} constant.
   *
   * @throws UserError if a file type is unknown, or if an XML extension is shorter than two
   *     characters
   */
  private void setupFileTypes() {
    String rawSpecs = getEnvVar("LGTM_INDEX_FILETYPES", "");
    for (String line : Main.NEWLINE.split(rawSpecs)) {
      String spec = line.trim();
      if (spec.isEmpty()) {
        continue;
      }
      String[] parts = spec.split(":");
      if (parts.length != 2) {
        continue;
      }
      String extension = parts[0].trim();
      String typeName = parts[1].trim();
      try {
        typeName = StringUtil.uc(typeName);
        if (!"XML".equals(typeName)) {
          fileTypes.put(extension, FileType.valueOf(typeName));
        } else if (extension.length() < 2) {
          throw new UserError("Invalid extension '" + extension + "'.");
        } else {
          // drop the first character, i.e. the dot of ".ext"
          xmlExtensions.add(extension.substring(1));
        }
      } catch (IllegalArgumentException e) {
        Exceptions.ignore(e, "We construct a better error message.");
        throw new UserError("Invalid file type '" + typeName + "'.");
      }
    }
  }

  /**
   * Processes <code>LGTM_INDEX_XML_MODE</code> (default <code>DISABLED</code>, compared after
   * trimming and upper-casing): <code>ALL</code> registers the <code>xml</code> extension for XML
   * extraction, <code>DISABLED</code> does nothing.
   *
   * @throws UserError for any other value
   */
  private void setupXmlMode() {
    String xmlMode = StringUtil.uc(getEnvVar("LGTM_INDEX_XML_MODE", "DISABLED").trim());
    if ("ALL".equals(xmlMode)) {
      xmlExtensions.add("xml");
      return;
    }
    if ("DISABLED".equals(xmlMode)) {
      return;
    }
    throw new UserError("Invalid XML mode '" + xmlMode + "' (should be either ALL or DISABLED).");
  }

  /**
   * Set up include and exclude matchers based on environment variables: first the include and
   * exclude paths used by the walking phase, then the patterns used by the filtering phase.
   */
  private void setupMatchers() {
    setupIncludesAndExcludes();
    setupFilters();
  }

  /**
   * Set up the include and exclude paths of the walking phase based on <code>
   * LGTM_INDEX_INCLUDE</code>, <code>LGTM_INDEX_EXCLUDE</code> and <code>
   * LGTM_REPOSITORY_FOLDERS_CSV</code>.
   *
   * <p>If <code>LGTM_INDEX_INCLUDE</code> contains no non-blank entry, the source root is the
   * single include path. Every folder classified as "external" or "metadata" in the CSV file
   * becomes an exclude path; a folder that cannot be turned into a real path is skipped with a
   * warning.
   *
   * @throws ResourceError if the CSV file cannot be read
   */
  private void setupIncludesAndExcludes() {
    // process `$LGTM_INDEX_INCLUDE` and `$LGTM_INDEX_EXCLUDE`
    boolean seenInclude = false;
    for (String pattern : Main.NEWLINE.split(getEnvVar("LGTM_INDEX_INCLUDE", "")))
      seenInclude |= addPathPattern(includes, LGTM_SRC, pattern);
    if (!seenInclude) includes.add(LGTM_SRC);
    for (String pattern : Main.NEWLINE.split(getEnvVar("LGTM_INDEX_EXCLUDE", "")))
      addPathPattern(excludes, LGTM_SRC, pattern);

    // process `$LGTM_REPOSITORY_FOLDERS_CSV`
    String lgtmRepositoryFoldersCsv = getEnvVar("LGTM_REPOSITORY_FOLDERS_CSV");
    if (lgtmRepositoryFoldersCsv != null) {
      Path path = Paths.get(lgtmRepositoryFoldersCsv);
      try (Reader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8);
          CSVReader csv = new CSVReader(reader)) {
        // skip titles
        csv.readNext();
        String[] fields;
        while ((fields = csv.readNext()) != null) {
          if (fields.length != 2) continue;
          if ("external".equals(fields[0]) || "metadata".equals(fields[0])) {
            String folder = fields[1];
            try {
              Path folderPath =
                  folder.startsWith("file://") ? Paths.get(new URI(folder)) : Paths.get(folder);
              excludes.add(toRealPath(folderPath));
            } catch (IllegalArgumentException | URISyntaxException | ResourceError e) {
              // `Paths.get(URI)` rejects unsuitable URIs (for example ones with a fragment or
              // query) with a plain IllegalArgumentException; InvalidPathException is a subclass
              // of it, so this clause covers both and a bad entry never aborts the build.
              Exceptions.ignore(e, "Ignore path and print warning message instead");
              warn(
                  "Ignoring '"
                      + fields[0]
                      + "' classification for "
                      + folder
                      + ", which is not a valid path.");
            }
          }
        }
      } catch (IOException e) {
        throw new ResourceError("Unable to process LGTM repository folder CSV.", e);
      }
    }
  }

  /**
   * Builds the {@link ProjectLayout} used in the filtering phase.
   *
   * <p>The built-in patterns come first, followed by the patterns from <code>
   * LGTM_INDEX_FILTERS</code>. Each entry of that variable has the form <code>include:PATTERN
   * </code> or <code>exclude:PATTERN</code>; the pattern is prefixed with the source root. Blank
   * entries and entries that do not consist of exactly two colon-separated fields are skipped.
   */
  private void setupFilters() {
    List<String> patterns = new ArrayList<>();
    patterns.add("/");

    // start by excluding every file that has an extension ...
    patterns.add("-**/*.*");

    // ... then re-include HTML, JavaScript, YAML and TypeScript files
    List<FileType> extractedByDefault =
        Arrays.asList(FileType.HTML, FileType.JS, FileType.YAML, FileType.TYPESCRIPT);
    for (FileType filetype : extractedByDefault) {
      for (String extension : filetype.getExtensions()) {
        patterns.add("**/*" + extension);
      }
    }

    // configuration files that are relevant to the analysis
    Collections.addAll(
        patterns,
        "**/.eslintrc*",
        "**/.xsaccess", // SAP XSJS
        "**/xs-app.json", // SAP XSJS
        "**/*.view.json", // SAP UI5
        "**/manifest.json",
        "**/package.json",
        "**/*tsconfig*.json",
        "**/codeql-javascript-*.json");

    // extensions that were explicitly configured via `$LGTM_INDEX_FILETYPES`
    for (String extension : fileTypes.keySet()) {
      patterns.add("**/*" + extension);
    }

    // files whose name strongly suggests that they are minified
    patterns.add("-**/*.min.js");
    patterns.add("-**/*-min.js");

    // `node_modules` and `bower_components` folders
    patterns.add("-**/node_modules");
    patterns.add("-**/bower_components");

    // user-supplied patterns from `$LGTM_INDEX_FILTERS`, rooted at the source root
    String base = LGTM_SRC.toString().replace('\\', '/');
    for (String line : Main.NEWLINE.split(getEnvVar("LGTM_INDEX_FILTERS", ""))) {
      String filter = line.trim();
      if (filter.isEmpty()) {
        continue;
      }
      String[] parts = filter.split(":");
      if (parts.length != 2) {
        continue;
      }
      String prefix = "exclude".equals(parts[0].trim()) ? "-" : "";
      patterns.add(prefix + base + "/" + parts[1].trim());
    }

    filters = new ProjectLayout(patterns.toArray(new String[0]));
  }

  /**
   * Trims {@code pattern}, resolves it against {@code base} and adds the real path of the result
   * to {@code patterns}. If the real path cannot be computed, a warning is printed and nothing is
   * added.
   *
   * @return true if {@code pattern} is non-empty after trimming, whether or not a path was added
   */
  private boolean addPathPattern(Set<Path> patterns, Path base, String pattern) {
    String trimmed = pattern.trim();
    if (trimmed.isEmpty()) {
      return false;
    }
    Path resolved = base.resolve(trimmed);
    try {
      patterns.add(toRealPath(resolved));
    } catch (ResourceError e) {
      Exceptions.ignore(e, "Ignore exception and print warning instead.");
      warn("Skipping path " + resolved + ", which does not exist.");
    }
    return true;
  }

  /**
   * Returns whether the autobuilder has seen code, that is, the current value of the
   * {@code seenCode} flag.
   * This is overridden in tests.
   */
  protected boolean hasSeenCode() {
    return seenCode;
  }

  /**
   * Perform extraction: source files first, then externs (only if code was seen and we are not in
   * overlay change mode), then XML files, and finally the overlay metadata.
   *
   * @return 0 on success (also when no code was found), 137 if extraction ran out of memory, and 1
   *     if a {@link RuntimeException} or {@link IOException} was thrown during extraction
   */
  public int run() throws IOException {
    startThreadPool();
    try {
      // wait for source extraction to complete
      extractSource().join();
      // don't bother with the externs if no code was seen or in overlay change mode
      boolean externsNeeded = hasSeenCode() && !isOverlayChangeMode();
      if (externsNeeded) {
        extractExterns();
      }
      extractXml();
      writeOverlayMetadata();
    } catch (OutOfMemoryError oom) {
      // purposely not doing anything else (printing stack, etc.), as the JVM
      // basically guarantees nothing after an OOM
      System.err.println("Out of memory while extracting the project.");
      return 137; // the CodeQL CLI will interpret this as an out-of-memory error
    } catch (TypeScriptWrapperOOMError oom) {
      System.err.println("Out of memory while extracting the project.");
      System.err.println(oom.getMessage());
      oom.printStackTrace(System.err);
      return 137;
    } catch (RuntimeException | IOException e) {
      writeDiagnostics("Internal error: " + e, JSDiagnosticKind.INTERNAL_ERROR);
      e.printStackTrace(System.err);
      return 1;
    } finally {
      shutdownThreadPool();
      diagnosticsToClose.forEach(DiagnosticWriter::close);
    }

    if (hasSeenCode()) {
      return 0;
    }

    warn(
        seenFiles
            ? "Only found JavaScript or TypeScript files that were empty or contained syntax errors."
            : "No JavaScript or TypeScript code found.");
    // remove the source folder so that the finalize step detects that no code was seen
    Path srcFolder = Paths.get(EnvironmentVariables.getWipDatabase(), "src");
    try {
      FileUtil8.recursiveDelete(srcFolder);
    } catch (NoSuchFileException e) {
      Exceptions.ignore(e, "the directory did not exist");
    }
    return 0;
  }

  /**
   * Creates the file named by <code>CODEQL_EXTRACTOR_JAVASCRIPT_OVERLAY_BASE_METADATA_OUT</code>,
   * if that variable has a value.
   *
   * <p>There is currently no metadata to emit, so the file is written empty; it must nevertheless
   * exist for the database to be recognized as an overlay base.
   *
   * @throws ResourceError if the file cannot be written
   */
  private void writeOverlayMetadata() {
    String metadataFile = getEnvVar("CODEQL_EXTRACTOR_JAVASCRIPT_OVERLAY_BASE_METADATA_OUT");
    if (metadataFile != null) {
      try {
        Files.writeString(Paths.get(metadataFile), "", StandardCharsets.UTF_8);
      } catch (IOException e) {
        throw new ResourceError("Could not write overlay metadata to " + metadataFile, e);
      }
    }
    // otherwise no overlay metadata file was specified, so there is nothing to do
  }

  /**
   * The kinds of error that can be reported while extracting JavaScript or TypeScript code.
   * Each kind carries the identifier, display name and severity level that
   * {@link #writeDiagnostics(String, JSDiagnosticKind)} passes on to the diagnostic writer.
   */
  public enum JSDiagnosticKind {
    PARSE_ERROR(
        "parse-error",
        "Could not process some files due to syntax errors",
        DiagnosticLevel.Warning),
    INTERNAL_ERROR(
        "internal-error",
        "Internal error",
        DiagnosticLevel.Debug);

    private final String id;
    private final String name;
    private final DiagnosticLevel level;

    JSDiagnosticKind(String id, String name, DiagnosticLevel level) {
      this.id = id;
      this.name = name;
      this.level = level;
    }

    /** The identifier of this kind; reported with a "js/" prefix. */
    public String getId() {
      return id;
    }

    /** The human-readable name of this kind. */
    public String getName() {
      return name;
    }

    /** The severity level with which diagnostics of this kind are reported. */
    public DiagnosticLevel getLevel() {
      return level;
    }
  }

  /** Counter that gives each per-thread diagnostics file a distinct number. */
  private AtomicInteger diagnosticCount = new AtomicInteger(0);

  /** The diagnostic writers that were opened and still have to be closed. */
  private List<DiagnosticWriter> diagnosticsToClose = Collections.synchronizedList(new ArrayList<>());

  /**
   * The diagnostic writer of the current thread, created on first use. The value is {@code null}
   * if no writer could be opened for the thread.
   */
  private ThreadLocal<DiagnosticWriter> diagnostics = new ThreadLocal<DiagnosticWriter>() {
    @Override
    protected DiagnosticWriter initialValue() {
      DiagnosticWriter result = initDiagnosticsWriter(diagnosticCount.incrementAndGet());
      // `initDiagnosticsWriter` returns null when there is no usable diagnostics directory.
      // Only real writers are registered: a null entry would make closing every registered
      // writer fail with a NullPointerException.
      if (result != null) {
        diagnosticsToClose.add(result);
      }
      return result;
    }
  };

  /**
   * Persist a diagnostic message to a file in the diagnostics directory.
   * See {@link JSDiagnosticKind} for the kinds of errors that can be reported,
   * and see
   * {@link DiagnosticWriter} for more details.
   *
   * <p>This is the location-less variant: it delegates to the three-argument overload with a
   * {@code null} location.
   */
  public void writeDiagnostics(String message, JSDiagnosticKind error) throws IOException {
    writeDiagnostics(message, error, null);
  }


  /**
   * Persist a diagnostic message, together with a location, to a file in the diagnostics
   * directory. If the current thread has no diagnostic writer, a warning is printed instead and
   * the message is not persisted.
   * See {@link JSDiagnosticKind} for the kinds of errors that can be reported,
   * and see
   * {@link DiagnosticWriter} for more details.
   */
  public void writeDiagnostics(String message, JSDiagnosticKind error, DiagnosticLocation location) throws IOException {
    DiagnosticWriter writer = diagnostics.get();
    if (writer == null) {
      warn("No diagnostics directory, so not writing diagnostic: " + message);
      return;
    }

    // arguments: level, extractor name, source id, source name, markdown message, location
    writer.writeMarkdown(
        error.getLevel(), "javascript", "js/" + error.getId(), error.getName(), message, location);
  }

  /**
   * Opens the diagnostics file <code>autobuilder-COUNT.jsonl</code> in the directory named by
   * <code>CODEQL_EXTRACTOR_JAVASCRIPT_DIAGNOSTIC_DIR</code>.
   *
   * @param count the number to embed in the file name
   * @return a writer for that file, or {@code null} if the variable is not set, the directory
   *     does not exist, or the file cannot be opened (a warning is printed in the latter two
   *     cases)
   */
  private DiagnosticWriter initDiagnosticsWriter(int count) {
    String diagnosticsDir = System.getenv("CODEQL_EXTRACTOR_JAVASCRIPT_DIAGNOSTIC_DIR");
    if (diagnosticsDir == null) {
      return null;
    }

    File directory = new File(diagnosticsDir);
    if (!directory.isDirectory()) {
      warn("Diagnostics directory " + diagnosticsDir + " does not exist");
      return null;
    }

    File diagnosticsFile = new File(directory, "autobuilder-" + count + ".jsonl");
    try {
      return new DiagnosticWriter(diagnosticsFile);
    } catch (FileNotFoundException e) {
      warn("Failed to open diagnostics file " + diagnosticsFile);
      return null;
    }
  }

  /**
   * Creates a fixed-size thread pool if <code>LGTM_THREADS</code> asks for more than one thread
   * (the default is one); otherwise leaves {@code threadPool} as {@code null}. The chosen mode is
   * announced on standard output.
   */
  private void startThreadPool() {
    int numThreads = Env.systemEnv().getInt("LGTM_THREADS", 1);
    boolean parallel = numThreads > 1;
    System.out.println(
        parallel
            ? "Parallel extraction with " + numThreads + " threads."
            : "Single-threaded extraction.");
    threadPool = parallel ? Executors.newFixedThreadPool(numThreads) : null;
  }

  /**
   * Shuts down the thread pool, if there is one, and waits (for up to 365 days) until its tasks
   * have terminated. An interruption while waiting is ignored.
   */
  private void shutdownThreadPool() {
    if (threadPool == null) {
      return;
    }
    threadPool.shutdown();
    try {
      threadPool.awaitTermination(365, TimeUnit.DAYS);
    } catch (InterruptedException e) {
      Exceptions.ignore(e, "Awaiting termination is not essential.");
    }
  }

  /**
   * Extract all "*.js" files under <code>$SEMMLE_DIST/tools/data/externs</code> as externs.
   *
   * <p>If no trap cache was configured explicitly (the configured cache is a {@link
   * DummyTrapCache}), the cache under <code>$SEMMLE_DIST/.cache/trap-cache/javascript</code> is
   * used when that directory exists; the first failed lookup in it produces a warning.
   */
  private void extractExterns() throws IOException {
    ExtractorConfig config = new ExtractorConfig(false).withExterns(true);

    // use explicitly specified trap cache, or otherwise $SEMMLE_DIST/.cache/trap-cache/javascript,
    // which we pre-populate when building the distribution
    ITrapCache externsCache = this.trapCache;
    if (externsCache instanceof DummyTrapCache) {
      Path distCachePath =
          SEMMLE_DIST.resolve(".cache").resolve("trap-cache").resolve("javascript");
      if (!Files.isDirectory(distCachePath)) {
        warn("No externs trap cache found");
      } else {
        externsCache =
            new DefaultTrapCache(distCachePath.toString(), null, Main.EXTRACTOR_VERSION, false) {
              boolean warnedAboutCacheMiss = false;

              @Override
              public File lookup(String source, ExtractorConfig config, FileType type) {
                File cached = super.lookup(source, config, type);
                // warn on the first failed lookup only
                if (cached == null && !warnedAboutCacheMiss) {
                  warn("Trap cache lookup for externs failed.");
                  warnedAboutCacheMiss = true;
                }
                return cached;
              }
            };
      }
    }

    FileExtractor extractor = new FileExtractor(config, outputConfig, externsCache);
    FileVisitor<? super Path> visitor =
        new SimpleFileVisitor<Path>() {
          @Override
          public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
              throws IOException {
            boolean isJavaScript = ".js".equals(FileUtil.extension(file.toString()));
            if (isJavaScript) {
              extract(extractor, file, true);
            }
            return super.visitFile(file, attrs);
          }
        };
    Files.walkFileTree(SEMMLE_DIST.resolve("tools").resolve("data").resolve("externs"), visitor);
  }

  /**
   * Orders paths the way they should be extracted.
   * <p>
   * A source file is extracted at most once, yet it may be included from several
   * tsconfig.json files, so the order in which tsconfig.json files are visited decides
   * which of them is used for such a file and can therefore affect extraction results.
   * <p>
   * Sorting helps keep extraction reproducible. Paths with more name elements come first,
   * so that files tend to be extracted with the most specific tsconfig.json file; paths
   * with the same number of name elements fall back to their natural ordering.
   */
  public static final Comparator<Path> PATH_ORDERING =
      (f1, f2) -> {
        // deeper paths (more name elements) sort before shallower ones
        int depthDifference = f2.getNameCount() - f1.getNameCount();
        return depthDifference != 0 ? depthDifference : f1.compareTo(f2);
      };

  /**
   * The {@link File} counterpart of {@link #PATH_ORDERING}: both files are converted to
   * paths and compared with that ordering.
   */
  public static final Comparator<File> FILE_ORDERING =
      (f1, f2) -> PATH_ORDERING.compare(f1.toPath(), f2.toPath());

  /**
   * Bundles a default {@link FileExtractor} with extractors registered for specific
   * file extensions.
   */
  public class FileExtractors {
    /** Extractor used when no custom extractor is registered for a file's extension. */
    FileExtractor defaultExtractor;

    /** Custom extractors keyed by file extension, kept in insertion order. */
    Map<String, FileExtractor> customExtractors = new LinkedHashMap<>();

    FileExtractors(FileExtractor defaultExtractor) {
      this.defaultExtractor = defaultExtractor;
    }

    /**
     * Returns the extractor registered for the extension of <code>f</code>, or the default
     * extractor if there is none.
     */
    public FileExtractor forFile(Path f) {
      return this.customExtractors.getOrDefault(FileUtil.extension(f), this.defaultExtractor);
    }

    /** Returns the file type that the extractor responsible for <code>f</code> assigns to it. */
    public FileType fileType(Path f) {
      FileExtractor responsible = forFile(f);
      return responsible.getFileType(f.toFile());
    }
  }

  /**
   * Extract all supported candidate files that pass the filters.
   * <p>
   * The candidate files and tsconfig.json files are collected first, then narrowed (overlay
   * changes, files rejected by <code>isFileTooLarge</code>, files inside a tsconfig.json
   * <code>outDir</code>) and sorted by {@link #PATH_ORDERING}. Extraction then runs in three
   * phases: HTML files, TypeScript, and finally every remaining file.
   *
   * @return the future returned by the final <code>extractFiles</code> call, i.e. the one
   *     covering the files scheduled in the last phase
   * @throws IOException presumably propagated from the file discovery or extraction helpers
   *     called here (TODO confirm which of them declare it)
   */
  private CompletableFuture<?> extractSource() throws IOException {
    // default extractor
    FileExtractor defaultExtractor =
        new FileExtractor(mkExtractorConfig(), outputConfig, trapCache);

    FileExtractors extractors = new FileExtractors(defaultExtractor);

    // custom extractor for explicitly specified file types
    for (Map.Entry<String, FileType> spec : fileTypes.entrySet()) {
      String extension = spec.getKey();
      String fileType = spec.getValue().name();
      ExtractorConfig extractorConfig = mkExtractorConfig().withFileType(fileType);
      extractors.customExtractors.put(extension, new FileExtractor(extractorConfig, outputConfig, trapCache));
    }

    // collect the candidate files and tsconfig.json files
    // (presumably findFilesToExtract fills both collections — verify against its definition)
    Set<Path> filesToExtract = new LinkedHashSet<>();
    List<Path> tsconfigFiles = new ArrayList<>();
    findFilesToExtract(defaultExtractor, filesToExtract, tsconfigFiles);

    // when overlay changes are available, keep only the candidates listed among them;
    // the listed files are compared as absolute paths
    OverlayChanges overlay = getOverlayChanges();
    if (overlay != null) {
      Set<Path> changedFiles = overlay.changes.stream()
          .map(file -> Paths.get(file).toAbsolutePath())
          .collect(Collectors.toSet());
      int before = filesToExtract.size();
      filesToExtract.retainAll(changedFiles);
      int after = filesToExtract.size();
      System.out.println("Overlay filter removed " + (before - after) + " out of " + before + " files from extraction.");
    }

    // sort tsconfig.json files so they are processed in a reproducible order
    tsconfigFiles = tsconfigFiles.stream()
         .sorted(PATH_ORDERING)
         .collect(Collectors.toList());

    // drop files rejected by isFileTooLarge and sort the rest; the LinkedHashSet keeps
    // the sorted order for the iterations below
    filesToExtract = filesToExtract.stream()
        .filter(p -> !isFileTooLarge(p))
        .sorted(PATH_ORDERING)
        .collect(Collectors.toCollection(() -> new LinkedHashSet<>()));
    // gather all output directories specified in tsconfig.json files
    final List<Path> outDirs = new ArrayList<>();
    for (Path cfg : tsconfigFiles) {
      try {
        String txt = new WholeIO().read(cfg);
        TsConfigJson root = new Gson().fromJson(txt, TsConfigJson.class);
        if (root != null && root.getCompilerOptions() != null) {
          if (root.getCompilerOptions().getOutDir() == null) {
            // no outDir specified, so skip this tsconfig.json
            continue;
          }
          // outDir is resolved against the directory containing the tsconfig.json
          Path odir = cfg.getParent().resolve(root.getCompilerOptions().getOutDir()).toAbsolutePath().normalize();
          // Only exclude outDirs that are proper subdirectories of the source root
          // This prevents excluding all code when outDir points outside the source root or to the source root itself
          if (tryRelativize(LGTM_SRC, odir) != null && !odir.equals(LGTM_SRC)) {
            outDirs.add(odir);
          }
        }
      } catch (Exception e) {
        // ignore malformed tsconfig or missing fields
      }
    }
    // exclude files in output directories as configured in tsconfig.json
    filesToExtract.removeIf(f -> outDirs.stream().anyMatch(od -> f.startsWith(od)));

    // packages and dependencies are only prepared when at least one tsconfig.json was found
    DependencyInstallationResult dependencyInstallationResult = DependencyInstallationResult.empty;
    if (!tsconfigFiles.isEmpty()) {
      dependencyInstallationResult = this.preparePackagesAndDependencies(filesToExtract);
    }
    // files that have been handed to an extractor; shared by all phases below so that
    // no file is extracted twice
    Set<Path> extractedFiles = new LinkedHashSet<>();

    // Extract HTML files as they may contain TypeScript
    CompletableFuture<?> htmlFuture = extractFiles(
        filesToExtract, extractedFiles, extractors,
        f -> extractors.fileType(f) == FileType.HTML);

    htmlFuture.join(); // Wait for HTML extraction to be finished.

    // extract TypeScript projects and files
    extractTypeScript(filesToExtract, extractedFiles,
              extractors, tsconfigFiles, dependencyInstallationResult);

    boolean hasTypeScriptFiles = hasTypeScriptFiles(filesToExtract);

    // extract remaining files; when TypeScript files are present, skip files that look like
    // the compiled output of a TypeScript file that has already been extracted
    return extractFiles(
        filesToExtract, extractedFiles, extractors,
        f -> !(hasTypeScriptFiles && isFileDerivedFromTypeScriptFile(f, extractedFiles)));
  }

  /**
   * Starts extraction of every file in <code>filesToExtract</code> that is not yet in
   * <code>extractedFiles</code> and is accepted by <code>shouldExtract</code>.
   * <p>
   * Each file is added to <code>extractedFiles</code> right before its extraction is started,
   * so the predicate observes the files scheduled earlier in the same call.
   *
   * @return a future that completes once all extractions started by this call have completed
   */
  private CompletableFuture<?> extractFiles(
      Set<Path> filesToExtract,
      Set<Path> extractedFiles,
      FileExtractors extractors,
      Predicate<Path> shouldExtract) {
    List<CompletableFuture<?>> pending = new ArrayList<>();
    for (Path candidate : filesToExtract) {
      // the predicate is only consulted for files that have not been extracted yet
      if (!extractedFiles.contains(candidate) && shouldExtract.test(candidate)) {
        extractedFiles.add(candidate);
        pending.add(extract(extractors.forFile(candidate), candidate, true));
      }
    }
    return CompletableFuture.allOf(pending.toArray(new CompletableFuture[0]));
  }

  /**
   * Returns true if the given path is likely the output of compiling a TypeScript file
   * which we have already extracted.
   * <p>
   * Only files whose name ends in a JavaScript-like extension are considered. Such a file is
   * treated as derived if <code>extractedFiles</code> contains a sibling with the same stem
   * and one of the TypeScript extensions.
   *
   * @param path the file to check
   * @param extractedFiles the files that have been extracted so far
   * @return true if a matching TypeScript sibling has already been extracted
   */
  private boolean isFileDerivedFromTypeScriptFile(Path path, Set<Path> extractedFiles) {
    Path fileName = path.getFileName();
    if (fileName == null) {
      // a path without name elements (e.g. a root) cannot be a compiled source file
      return false;
    }
    String name = fileName.toString();
    // only skip JS variants when a corresponding TS/TSX file was already extracted
    if (!(name.endsWith(".js")
          || name.endsWith(".cjs")
          || name.endsWith(".mjs")
          || name.endsWith(".jsx")
          || name.endsWith(".cjsx")
          || name.endsWith(".mjsx"))) {
      return false;
    }
    // strip off extension; the check above guarantees the name contains a '.'
    String stem = name.substring(0, name.lastIndexOf('.'));
    // if a TS/TSX file with same base name was extracted, skip this file
    for (String ext : FileType.TYPESCRIPT.getExtensions()) {
      // resolveSibling also works for a path without a parent, where
      // getParent() would return null
      if (extractedFiles.contains(path.resolveSibling(stem + ext))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns an existing file named <code>dir/stem.ext</code> where <code>.ext</code> is any
   * of the given extensions, or <code>null</code> if no such file exists.
   * <p>
   * The extensions are tried in iteration order and the first match wins.
   *
   * @param dir the directory to look in
   * @param stem the file name without its extension
   * @param extensions the candidate extensions, each including the leading dot
   * @return the first existing candidate, or <code>null</code> if there is none
   */
  private static Path tryResolveWithExtensions(Path dir, String stem, Iterable<String> extensions) {
    for (String ext : extensions) {
      Path candidate = dir.resolve(stem + ext);
      // The candidate is already resolved against dir; resolving it a second time would
      // probe dir/dir/stem.ext whenever dir is a relative path.
      if (Files.exists(candidate)) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Looks for an existing file <code>dir/stem.ext</code>, trying the TypeScript extensions
   * before the JavaScript ones.
   *
   * @return the first file found, or <code>null</code> if no such file exists
   */
  private static Path tryResolveTypeScriptOrJavaScriptFile(Path dir, String stem) {
    Path typeScriptMatch = tryResolveWithExtensions(dir, stem, FileType.TYPESCRIPT.getExtensions());
    return typeScriptMatch != null
        ? typeScriptMatch
        : tryResolveWithExtensions(dir, stem, FileType.JS.getExtensions());
  }

  /**
   * Gets a relative path from <code>from</code> to <code>to</code> provided
   * the latter is contained in the former. Otherwise returns <code>null</code>.
   * <p>
   * Paths that cannot be relativized against each other at all (for example an absolute
   * and a relative path) are treated as not contained, so <code>null</code> is returned
   * for them as well.
   *
   * @param from the directory that is expected to contain <code>to</code>
   * @param to the path to express relative to <code>from</code>
   * @return a path or null
   */
  public static Path tryRelativize(Path from, Path to) {
    Path relative;
    try {
      relative = from.relativize(to);
    } catch (IllegalArgumentException ignored) {
      // relativize rejects paths it cannot relate to each other; such a path is
      // certainly not contained in 'from'
      return null;
    }
    if (relative.startsWith("..") || relative.isAbsolute()) {
      return null;
    }
    return relative;
  }

  /**
   * Prepares <code>package.json</code> files in a virtual source root, and, if enabled,
   * installs dependencies for use by the TypeScript type checker.
   * <p>
   * Some packages must be downloaded while others exist within the same repo ("monorepos")
   * but are not in a location where TypeScript would look for it.
   * <p>
   * Downloaded packages are installed under <code>SCRATCH_DIR</code>, in a mirrored directory hierarchy
   * we call the "virtual source root".
   * <p>
   * Packages that exist within the repo are not downloaded. Since they are part of the main source tree,
   * these packages are not mirrored under the virtual source root.
   * Instead, an explicit package location mapping is passed to the TypeScript parser wrapper.
   * <p>
   * The TypeScript parser wrapper then overrides module resolution so packages can be found
   * under the virtual source root and via that package location mapping.
   */
protected DependencyInstallationResult preparePackagesAndDependencies(Set<Path> filesToExtract) {
    final Path sourceRoot = LGTM_SRC;

    // Read all package.json files and index them by name.
    Map<Path, PackageJson> packageJsonFiles = new LinkedHashMap<>();
    Map<String, Path> packagesInRepo = new LinkedHashMap<>();
    Map<String, Path> packageMainFile = new LinkedHashMap<>();
    for (Path file : filesToExtract) {
      if (file.getFileName().toString().equals("package.json")) {
        try {
          PackageJson packageJson = new Gson().fromJson(new WholeIO().read(file), PackageJson.class);
          if (packageJson == null) {
            continue;
          }
          file = file.toAbsolutePath();
          if (tryRelativize(sourceRoot, file) == null) {
            continue; // Ignore package.json files outside the source root.
          }
          packageJsonFiles.put(file, packageJson);

          String name = packageJson.getName();
          if (name != null) {
            packagesInRepo.put(name, file);
          }
        } catch (JsonParseException e) {
          System.err.println("Could not parse JSON file: " + file);
          System.err.println(e);
          // Continue without the malformed package.json file
        }
      }
    }

    // Guess the main file for each package.
    packageJsonFiles.forEach(
      (path, packageJson) -> {
          Path relativePath = sourceRoot.relativize(path);
          // For named packages, find the main file.
          String name = packageJson.getName();
          if (name != null) {
            Path entryPoint = null;
            try {
              entryPoint = guessPackageMainFile(path, packageJson, FileType.TYPESCRIPT.getExtensions());
              if (entryPoint == null) {
                // Try a TypeScript-recognized JS extension instead
                entryPoint = guessPackageMainFile(path, packageJson, Arrays.asList(".js", ".jsx"));
              }
            } catch (InvalidPathException ignore) {
              // can happen if the `main:` field is invalid. E.g. on Windows a path like `dist/*.js` will crash.
            }
            if (entryPoint != null) {
              System.out.println(relativePath + ": Main file set to " + sourceRoot.relativize(entryPoint));
              packageMainFile.put(name, entryPoint);
            } else {
              System.out.println(relativePath + ": Main file not found");
            }
          }
        });

    if (installDependencies) {
      // Use more threads for dependency installation than for extraction, as this is mainly I/O bound and we want
      // many concurrent HTTP requests.
      ExecutorService installationThreadPool = Executors.newFixedThreadPool(50);
      AsyncFetcher fetcher = new AsyncFetcher(installationThreadPool, err -> { System.err.println(err); });
      try {
        List<CompletableFuture<Void>> futures = new ArrayList<>();
        packageJsonFiles.forEach((file, packageJson) -> {
          Path virtualFile = virtualSourceRoot.toVirtualFile(file);
          Path nodeModulesDir = virtualFile.getParent().resolve("node_modules");
          futures.add(new DependencyResolver(fetcher, packagesInRepo.keySet()).installDependencies(packageJson, nodeModulesDir));
        });
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
      } finally {
        installationThreadPool.shutdown();
        try {
          installationThreadPool.awaitTermination(1, TimeUnit.HOURS);
        } catch (InterruptedException e) {
          Exceptions.ignore(e, "Awaiting termination is not essential.");
        }
      }
    }

    return new DependencyInstallationResult(packageMainFile, packagesInRepo);