-
-
Notifications
You must be signed in to change notification settings - Fork 301
Expand file tree
/
Copy path__init__.py
More file actions
1702 lines (1389 loc) · 57.5 KB
/
__init__.py
File metadata and controls
1702 lines (1389 loc) · 57.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#
# Copyright (c) AboutCode and others. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about our open source projects.
#
from dataclasses import dataclass
from dataclasses import field as datafield
from hashlib import sha256
from pathlib import Path
from typing import Any
from typing import Iterable
from typing import Optional
from typing import Tuple
from typing import Union
from urllib.parse import quote
from urllib.parse import urlsplit
import requests
import saneyaml
import uritemplate
from packageurl import PackageURL
from packageurl import normalize_qualifiers
from packageurl import normalize_subpath
from packageurl import normalize_version
__version__ = "0.1.0"
"""
Federated data utilities to handle content-defined and hash-addressable
Package data keyed by PURL stored in many Git repositories. This approach to
federate decentralized data is called FederatedCode.
Overview
========
The main design elements are:
1. Data Federation: A Data Federation is a database, representing a consistent,
non-overlapping set of data kind clusters (like scans, vulnerabilities or SBOMs)
across many package ecosystems, aka. PURL types.
A Federation is similar to a traditional database.
2. Data Cluster: A Data Federation contains Data Clusters, where a Data Cluster
purpose is to store the data of a single kind (like scans) across multiple PURL
types. The cluster name is the data kind name and is used as the prefix for
repository names. A Data Cluster is akin to a table in a traditional database.
3. Data Repository: A DataCluster consists of one or more Git Data Repositories,
each storing datafiles of the cluster data kind and one PURL type, spreading
the datafiles in multiple Data Directories. The name is data-kind +PURL-
type+hashid. A Repository is similar to a shard or tablespace in a traditional
database.
4. Data Directory: In a Repository, a Data Directory contains the datafiles for
PURLs. The directory name PURL-type+hashid
5. Data File: This is a Data File of the DataCluster's Data Kind that is
stored in subdirectories structured after the PURL components::
namespace/name/version/qualifiers/subpath:
- Either at the level of a PURL name: namespace/name,
- Or at the PURL version level namespace/name/version,
- Or at the PURL qualifiers+PURL subpath level.
A Data File can be for instance a JSON scan results file, or a list of PURLs in
YAML.
For example, a list of PURLs as a Data Kind would be stored at the name
subdirectory level::
gem-0107/gem/random_password_generator/purls.yml
Or a ScanCode scan as a Data Kind at the version subdirectory level::
gem-0107/npm/file/3.24.3/scancode.yml
Design
======
The core approach is to distribute the many datafiles for a package in multiple
directories stored in multiple Git repositories, so that each directory and repo
is not too big, with not too many files, and files are spread roughly evenly
across all the directories and repositories.
At the same time the design is such that it is possible to directly access a
single datafile across all these directories and Git repositories knowing only
its package PURL and resolve that to a URL to fetch a single datafile directly
by using the Git web interface (like on GitHub, Gitlab or gitweb)
Why not using a single Git repo?
--------------------------------
We need multiple Git repositories to avoid very big repositories that are
impractical to use. We want each repo to be under the common limits of public
repository hosting services, like GitHub and its 5GB limit. Typically a maximum
size of 5GB and a target size of about 1GB of compressed content makes the most
sense. We store text, and Git's combination of xdiff/xdelta and zlib compression
typically can reduce the stored size by a factor of about 5, meaning that a 1GB
repo may contain about 5GB of actual uncompressed text.
Why not using a single dir in a repo?
--------------------------------------
Multiple directories are needed to store many package datafiles to avoid
directories with too many files in the same directory, which makes every
filesystem performance suffer. Typically a max of about 10,000 files in a
directory is a decent target.
Hash-based content distribution
-------------------------------
To distribute files roughly evenly across repositories and directories and still
using PURL as a key, we use a hashid derived from a hash computed on the PURL
string and use that to generate repositories and directory names.
It then becomes possible to distribute the data across many Git repositories and
directories evenly and compute a URL and path to access a datafile directly
from a PURL.
Object hierarchy
----------------
- **federation**: defined by its name and a Git repo with a config file with
clusters configuration for data kind and PURL type parameters, enabling pointing
to multiple repositories
- **cluster**: identified by the data kind name, prefixing its data repos
- **repo**: data repo (Git) identified by datakind+PURL-type+hashid
- **directory**: dir in a repo, identified by PURL-type+PURL-hashid
- **PURL path**: ns/name/version/extra_path derived from the PURL
- **datafile**: file storing the data as text JSON/YAML/XML
Example
-------
For instance, in the aboutcode data federation, for a cluster about purl
versions, we would have:
- data federation definition git repo, with its config file.
- aboutcode-data/aboutcode-data
- aboutcode-federation-config.yml
- data cluster repos name prefix is the data kind
- aboutcode-data/purls
- data repository git repo, with a purl sub dir tree and datafile.
The first repo name has a hash of 0000 which is the first PURL hashid of the
range of PURL hashid stored in this repo's dirs.
- aboutcode-data/purls-gem-0000/
- data directory, with a purl sub dir tree and datafile. The dir name
composed of type+hashid.
- aboutcode-data/purls-gem-0000/gem-0107/
- PURL subdirectory, and datafile, here list of PURLs for the gem named rails:
- aboutcode-data/purls-gem-0000/gem-0107/rails/purls.yml
In this example, if the base URL for this cluster is at the aboutcode-data
GitHub organization, so the URL to the purls.yml datafile is inferred this way
based on the cluster config::
https://github.com/
aboutcode-data/purls-gem-0000/
raw/refs/heads/main/
gem-0107/rails/purls.yml
More Design details
===================
The DataCluster and Data kind design aligns with the needs of users: for
example, a user using only vulnerability data for Java and JavaScript may not
care directly for Haskell metadata. Or may care only for another kind of data
like fingerprints.
* DataCluster: A set of repos for only one data kind for many package types.
* Data Kind: Identifier for the kind of data stored in the datafile of
DataCluster, like PURL versions, or the original API metadata files, or high
level scans, or scans with file details, reachability slices, fingerprints, or
vulnerability advisories and so on.
* Repository: A repo is a Git repo that stores a group of Directories of a
DataCluster/data kind, like for all the npms with a PURL hash of 0000 to 1023,
where we store npm metadata files for each PURL. All repo names in a cluster
share the same data-kind prefix.
* Directory: Named after a PURL type and PURL hashid, it stores the datafiles
for the PURLs that hash to that hashid.
Naming conventions
-------------------
- Federation: like aboutcode-data. Also the name of the config repo.
- DataCluster name prefix: data kind stored in that cluster, like "purls" or "scancode"
- For data repos: data kind + PURL type + PURL hashid like
purls-npm-0512 or purls-scancode-scans-0000
The PURL hashid is the first hashid of a range of hashid stored in that repo.
- For data dirs in a repo: PURL type + dir_number like npm-0513 or pypi-0000.
The hashid is that of the PURLs whose data files are stored in that directory.
PURL Hashid
-----------
The PURL hashid is central to the design and is simply a number between 0 and
1023 (e.g., 1024 values which is a power of two).
It could be updated to up 8192 in the future, but 1024 is good enough to spread
files in multiple dirs.
The Core PURL is a PURL without version, subpath and qualifiers. We hash this
Core PURL as UTF-8-encoded bytes using SHA256.
The first few bytes of the SHA256 binary digest are converted to an integer
using little endian encoding, then converted modulo a max value of 1024 to yield
an integer converted to a 4-chars, zero-padded string between 0000 and 1023.
Based on this hashid and the data kind and PURL type, directories are grouped in
one or more Git repositories of a cluster, based on a cluster-defined number of
directories of a type per Git repo.
Example of repo and dir names
-----------------------------
With 4 dirs per repo, we get 256 repos, like these
purls-npm-0000
npm-0000
npm-0001
npm-0002
npm-0003
purls-npm-0004
npm-0004
npm-0005
npm-0006
npm-0007
purls-npm-0008
npm-0008
... and so on
And with 512 dirs per repo, we get 2 repos:
purls-npm-0000
npm-0000
npm-0001
npm-0002
...
npm-0511
purls-npm-0512
npm-0512
npm-0513
...
npm-1023
Git repos sizing assumptions for each ecosystems
-------------------------------------------------
For small ecosystems with few packages, like luarocks or swift, a single Git
repo or a few repos may be enough to store all the data of a kind. There, a
luarocks cluster of repos will have a single Git repo, with 1024 root
directories.
At the other end of the spectrum, a package type with many packages like npm may
need 1024 Git repositories to store all the metadata. In this case a npm cluster
of repos will have 1024 Git repos, each with a single root directory.
We can start with reasonable assumptions wrt. the size of each cluster, as a
number of directory per Git repo and the volume of data we would store in each
using these starting values:
1. For super large ecosystems (with ~5M packages):
- one dir per repo, yielding 1,024 repos
- github, npm
2. For large ecosystems (with ~500K packages)
- eight dirs per repo, yielding 128 repos
- golang, maven, nuget, perl, php, pypi, ruby, huggingface
3. For medium ecosystems (with ~50K packages)
- 32 dirs per repo, yielding 32 Git repositories
- alpm, bitbucket, cocoapods, composer, deb, docker, gem, generic,
mlflow, pub, rpm, cargo
4. For small ecosystem (with ~2K packages)
- 1,024 directories in one git repository
- all others
For instance, say we want a cluster to store all the npm PURLs. As of 2025-10,
npm hosts about 4M unique package names (and roughly 20 versions per name on
average with ~80M updates in total in https://replicate.npmjs.com/). Storing 4M
names takes about 100MB uncompressed. Adding versions would take about 2GB
uncompressed. This means that we can store comfortably all npm PURLs in a single
repository size-wise, but we may want to use more repositories anyway as storing
4M directories and purls.yml files in a single repo will not be a happy event,
so using 32 repos with 32 dirs or 64 repos with 16 dirs may be a better
approach.
See also original post on the approach:
- https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
Rebalancing and splitting a DataCluster repos
------------------------------------------------
We can rebalance a cluster, like when we first store the data in a cluster with
a single Git repository for a given PURL type, and later split this repo to more
repos, without losing the ability to address datafiles directly just knowing a
PURL and without having to rename all the files and directories.
In this design, the directory names are stable and do not change as long as we
keep the default 1024 hash values for the PURL hashid. The only thing that
changes are the repo names when more repos are created from a split, when the
size of a Git repo grows too large.
When a split is to occur, we should perform these operations:
- lock the cluster as "read-only" for the duration of a split operation. This is
to signal to processes and tool that are updating the cluster that they cannot
push new data to there yet. This could be done by updating the cluster config
or the federation config.
- copy existing Git repos to be split to new repos based on the new number of
directories per repo.
- filter Git history in existing and new repos to keep only the history related
to the directories stored in a given repo.
- update the cluster config file in cluster Git repo with the new number of
directories
- push new Git and existing Git repos
- unlock the cluster.
We may need to keep the old and new Clusters around too, and may need to add a
simple DataCluster version suffix in Cluster names, and a way to redirect from an
old frozen, inactive DataCluster to a new rebalanced one.
It may even be possible to continue writing to a cluster as long as writing is
done in two places until the split is completed. In practice split should be
reasonably rare and reasonably fast, making this a lesser issue.
It is also possible to change the PURL hashid range for a DataCluster, say going
from 1024 to 2048, 4096 or 8192. This would imply moving all the files around
as the directory structure would change with the new hashids. This is likely
to be an exceptional operation.
"""
# Prefix shared by all package data repository names (see get_package_base_dir)
PACKAGE_REPOS_NAME_PREFIX = "aboutcode-packages"

# Conventional datafile names for the "purls" and "vulnerabilities" data kinds
KIND_PURLS_FILENAME = "purls.yml"
KIND_VULNERABILITIES_FILENAME = "vulnerabilities.yml"
def get_package_purls_yml_file_path(purl: Union["PackageURL", str]):
    """
    Return the Path to the ``purls.yml`` YAML datafile for this ``purl``.
    """
    base_dir = get_package_base_dir(purl)
    return base_dir / KIND_PURLS_FILENAME
def get_package_vulnerabilities_yml_file_path(purl: Union["PackageURL", str]):
    """
    Return the Path to the ``vulnerabilities.yml`` YAML datafile for this ``purl``.
    """
    base_dir = get_package_base_dir(purl)
    return base_dir / KIND_VULNERABILITIES_FILENAME
def get_package_base_dir(purl: Union["PackageURL", str]):
    """
    Return the base path to a Package directory (ignoring version) for a purl.
    """
    if isinstance(purl, str):
        purl = PackageURL.from_string(purl)
    # only the hash and the core (versionless) path matter for the base dir
    phash, core_path, _version, _extra_path = package_path_elements(purl)
    repo_name = f"{PACKAGE_REPOS_NAME_PREFIX}-{purl.type}-{phash}"
    return Path(repo_name) / core_path
@dataclass
class DataFederation:
    """
    A data federation is the root object and holds the configuration defining its
    data clusters, data kinds, PURL types and data repositories.
    """

    # Hardcoded AboutCode known "root" federation URL that is the parent of all
    # Git remote repositories
    ABCD_FED_ROOT_URL = "https://github.com/aboutcode-data"
    # and the corresponding well-known federation name
    ABCD_FED_NAME = "aboutcode-data"

    # Conventional name of the federation YAML configuration file
    CONFIG_FILENAME = "aboutcode-federated-config.yml"

    # name for this federation. Used as the prefix for all repos
    name: str

    # Root dir of all federation local data, like all Git repos checkout.
    local_root_dir: Optional[Path] = None

    # root URL for all Git repos for this federation
    remote_root_url: Optional[str] = None

    description: Optional[str] = datafield(default="")
    documentation_url: Optional[str] = datafield(default="")

    # SPDX license expression
    data_license: Optional[str] = datafield(default="")

    data_maintainers: list["DataMaintainer"] = datafield(default_factory=list)

    # List of DataCluster objects
    # Each cluster is for a single, unique data kind in a federation.
    data_clusters: list["DataCluster"] = datafield(default_factory=list, repr=False)

    # Internal {data_kind: DataCluster} index, kept in sync with data_clusters.
    # Auto-populated; not part of the serialized configuration.
    _data_clusters_by_data_kind: dict[str, "DataCluster"] = datafield(
        default_factory=dict, repr=False, init=False
    )

    def __post_init__(self):
        self.populate_clusters()

    def populate_clusters(self):
        """
        Rebuild the internal {data_kind: DataCluster} index from data_clusters.
        """
        self._data_clusters_by_data_kind = {
            cluster.data_kind: cluster for cluster in self.data_clusters
        }

    def add_cluster(self, cluster):
        """
        Add or replace ``cluster`` in this federation, keyed by its data_kind.
        """
        self._data_clusters_by_data_kind[cluster.data_kind] = cluster
        self.data_clusters = list(self._data_clusters_by_data_kind.values())

    @property
    def local_config_dir(self):
        # this is also the directory of the config Git repo checkout
        return self.local_root_dir / self.name

    @property
    def local_config_file(self):
        # conventional location of the config file inside the config repo
        return self.local_config_dir / self.CONFIG_FILENAME

    @classmethod
    def remote_config_file_url(
        cls,
        remote_root_url: str,
        federation_name: str,
    ):
        """Return a URL to directly download the federation config file"""
        return build_direct_federation_config_file_url(
            remote_root_url=remote_root_url,
            federation_name=federation_name,
            config_filename=cls.CONFIG_FILENAME,
        )

    @property
    def config_repo(self) -> "GitRepo":
        """
        Return the GitRepo that contains the configuration for this federation.
        """
        return GitRepo(
            name=self.name,
            local_root_dir=self.local_root_dir,
            remote_root_url=self.remote_root_url,
        )

    @classmethod
    def from_dict(
        cls,
        data: dict,
        local_root_dir: Optional[Path] = None,
        remote_root_url: Optional[str] = None,
    ) -> "DataFederation":
        """
        Return a DataFederation from a ``data`` configuration mapping.

        Raise TypeError if ``remote_root_url`` conflicts with the one found in
        ``data``, or if two clusters share the same data kind.
        """
        name = data["name"]
        rru = data.get("remote_root_url")
        if remote_root_url and rru != remote_root_url:
            raise TypeError(f"Inconsistent remote_root_urls: {rru!r} and {remote_root_url!r}")

        data_clusters = data.get("data_clusters") or []
        # each data kind must be unique across the federation clusters
        data_kinds = sorted(c["data_kind"] for c in data_clusters)
        if data_kinds != sorted(set(data_kinds)):
            raise TypeError(f"Duplicated data kinds: {data_kinds}")
        data_clusters = [DataCluster.from_dict(data=cluster) for cluster in data_clusters]

        data_maintainers = data.get("data_maintainers") or []
        data_maintainers = [DataMaintainer(**mnt) for mnt in data_maintainers]

        return cls(
            name=name,
            local_root_dir=Path(local_root_dir) if local_root_dir else None,
            remote_root_url=remote_root_url,
            description=data.get("description"),
            documentation_url=data.get("documentation_url"),
            data_license=data.get("data_license"),
            data_maintainers=data_maintainers,
            data_clusters=data_clusters,
        )

    @classmethod
    def load(cls, name: str, local_root_dir: Path, remote_root_url: str = None) -> "DataFederation":
        """
        Return an existing DataFederation loaded from ``local_root_dir`` using
        the existing configuration file at its conventional location.
        """
        lrd = Path(local_root_dir).resolve()
        lcf = lrd / name / cls.CONFIG_FILENAME
        return cls.from_yaml_config(
            name=name,
            text=lcf.read_text(),
            remote_root_url=remote_root_url,
            local_root_dir=lrd,
        )

    @classmethod
    def from_url(
        cls,
        name: str,
        remote_root_url: str,
        local_root_dir: Optional[Path] = None,
    ) -> "DataFederation":
        """
        Return a DataFederation loaded from a remote configuration file.
        Raise an Exception if the remote config cannot be fetched.
        """
        rcf_url = build_direct_federation_config_file_url(
            remote_root_url=remote_root_url,
            federation_name=name,
            config_filename=cls.CONFIG_FILENAME,
        )
        headers = {"User-Agent": "AboutCode/FederatedCode"}
        response = requests.get(url=rcf_url, headers=headers)
        if not response.ok:
            raise Exception(f"Failed to fetch Federation config: {rcf_url}")
        return cls.from_yaml_config(
            name=name,
            text=response.text,
            remote_root_url=remote_root_url,
            local_root_dir=local_root_dir,
        )

    @classmethod
    def from_yaml_config(
        cls,
        name: str,
        text: str,
        local_root_dir: Optional[Path] = None,
        remote_root_url: Optional[str] = None,
    ) -> "DataFederation":
        """
        Return a DataFederation loaded from a YAML configuration ``text``.
        Raise TypeError if the config name does not match ``name``.
        """
        data = saneyaml.load(text)
        if data["name"] != name:
            raise TypeError(
                f"Inconsistent federation name {name!r} " f"with YAML config text: {text!r}"
            )
        lrd = Path(local_root_dir) if local_root_dir else None
        return cls.from_dict(data=data, local_root_dir=lrd, remote_root_url=remote_root_url)

    def to_dict(self):
        """
        Return a mapping for this federation configuration.
        """
        return dict(
            name=self.name,
            remote_root_url=self.remote_root_url,
            description=self.description,
            documentation_url=self.documentation_url,
            data_license=self.data_license,
            data_maintainers=[m.to_dict() for m in self.data_maintainers],
            data_clusters=[dc.to_dict() for dc in self.data_clusters],
        )

    def to_yaml(self):
        """
        Return a YAML text string for this federation configuration.
        """
        return saneyaml.dump(self.to_dict())

    def dump(self):
        """
        Write this federation configuration file as YAML at its conventional
        location. Raise ValueError if local_root_dir is not set.
        """
        if not (lrd := self.local_root_dir):
            raise ValueError(f"Cannot dump without a local_root_dir : {lrd!r}")
        # local_config_file is already a Path
        self.local_config_file.write_text(self.to_yaml())

    @classmethod
    def init(cls, name, local_root_dir, remote_root_url=None) -> "DataFederation":
        """
        Initialize a new DataFederation in local_root_dir. Fetch the remote
        config repo if remote_root_url is provided and the repo exists there.
        """
        local_root_dir = Path(local_root_dir).resolve()
        # intended checkout location of the config repo (WIP scaffolding)
        local_config_repo_dir = local_root_dir / name
        # create dir if needed
        # or check if this is a git repo?
        # if not init git repo
        # create basic config and save that in the config file
        if remote_root_url:
            # TODO: clone or sync? repo in local_config_repo_dir
            # raise NotImplementedError("remote_repo_url is not yet supported.")
            pass
        raise NotImplementedError()

    def git_init(self):
        """
        Create all Git repos for this federation as needed. Sets the remote
        if the remote_root_url is defined.
        """
        raise NotImplementedError()

    @classmethod
    def bootstrap(cls, local_root_dir) -> "DataFederation":
        """
        Return the root, seed DataFederation from AboutCode, bootstrapping in
        local_root_dir.
        """
        return cls.init(
            name=cls.ABCD_FED_NAME,
            local_root_dir=local_root_dir,
            remote_root_url=cls.ABCD_FED_ROOT_URL,
        )

    def get_cluster(self, data_kind: str) -> "DataCluster":
        """
        Return a DataCluster for this data kind or None.
        """
        return self._data_clusters_by_data_kind.get(data_kind)

    def get_datafile_download_url(self, data_kind: str, purl: Union[str, "PackageURL"]) -> str:
        """
        Return the direct download URL to the data file for a data kind given a
        PURL, or None if there is no cluster for this data kind.
        """
        cluster = self.get_cluster(data_kind=data_kind)
        # guard: no cluster for this kind means no URL, per the docstring
        if not cluster:
            return None
        return cluster.get_datafile_download_url(purl=purl)

    def get_local_datafile(self, data_kind: str, purl: Union[str, "PackageURL"]) -> "LocalDataFile":
        """
        Return a LocalDataFile for a data kind given a PURL, or None if there
        is no cluster for this data kind.
        """
        cluster = self.get_cluster(data_kind=data_kind)
        if not cluster:
            return None
        # FIX: previously called cluster.get_datafile_local_path() which does
        # not exist; DataCluster defines get_local_datafile()
        return cluster.get_local_datafile(purl=purl)
@dataclass
class LocalDataFile:
    """A local data file stored optionally in a GitRepo"""

    # filesystem path to the datafile
    path: Path
    # GitRepo that contains this datafile, if any
    git_repo: "GitRepo" = None
@dataclass(order=True)
class DataCluster:
    """
    AboutCode Federation DataCluster: the set of repositories storing the
    datafiles of a single data kind across one or more PURL types.
    """

    # The name for the data kind stored in this data cluster. There is only one
    # per cluster and the name is unique in a federation.
    # this is the name of cluster
    data_kind: str

    # a URI template to build the path to the datafile for this data kind.
    # this is the path relative to the root of a cluster directory. It does not
    # include directory and repository.
    #
    # For instance for a purls.yml file stored for each package:
    #     {/namespace}/{name}/purls.yml
    #
    # For a scancode.json file stored for each package version:
    #     {/namespace}/{name}/{version}/scancode.json
    datafile_path_template: str

    # list of unique PurlTypeConfig for types stored in this data cluster.
    # "default" is the type that applies to all types not listed here by default
    # and it will be added if not provided.
    purl_type_configs: list["PurlTypeConfig"] = datafield(
        default_factory=list,
        repr=False,
    )

    # JSON or XML schema URL for the file format of this data kind if available
    data_schema_url: Optional[str] = datafield(default="")

    # description of the data kind format, and description of how this data kind
    # is created: which tool, option, etc. For instance, a short description of
    # a tool and the tool options, like a scancode toolkit command line option,
    # or the URL to an API when we fetch API data
    description: Optional[str] = datafield(default="")

    documentation_url: Optional[str] = datafield(default="")

    # SPDX license expression
    data_license: Optional[str] = datafield(default="")

    data_maintainers: list["DataMaintainer"] = datafield(default_factory=list)

    # mapping of {purl_type: DataRepository} for the repos stored in this data
    # cluster. This is auto populated and not serialized in the config file.
    _data_repositories_by_purl_type: dict[str, "DataRepository"] = datafield(
        default_factory=dict,
        init=False,
        repr=False,
    )

    # mapping of {purl_type: PurlTypeConfig} for the repos stored in this data
    # cluster. This is auto populated and not serialized in the config file.
    _configs_by_purl_type: dict[str, "PurlTypeConfig"] = datafield(
        default_factory=dict,
        init=False,
        repr=False,
    )

    def __post_init__(self):
        self.populate_repos()
        self.populate_configs()

    def populate_repos(self):
        """
        Populate the DataRepository for this DataCluster data kind and PurlTypeConfig.
        """
        kind = self.data_kind
        drbpt = self._data_repositories_by_purl_type
        for ptc in self.purl_type_configs:
            # materialize the repos generator once per purl type
            drbpt[ptc.purl_type] = list(ptc.get_repos(data_kind=kind))

    def populate_configs(self):
        """
        Populate the internal {purl_type: PurlTypeConfig} index.
        """
        for ptc in self.purl_type_configs:
            self._configs_by_purl_type[ptc.purl_type] = ptc

    @classmethod
    def from_dict(cls, data: dict) -> "DataCluster":
        """
        Return a DataCluster built from a ``data`` configuration mapping.
        Add the "default" PurlTypeConfig if not present. Raise ValueError on
        duplicated purl types.
        """
        ptcs = [PurlTypeConfig(**pt) for pt in data.get("purl_type_configs", [])]
        ptypes = sorted(pt.purl_type for pt in ptcs)
        if ptypes != sorted(set(ptypes)):
            raise ValueError(f"Duplicate purl types: {ptypes!r}")
        if "default" not in ptypes:
            ptcs.append(PurlTypeConfig.default_config())

        data_maintainers = data.get("data_maintainers") or []
        data_maintainers = [DataMaintainer(**mnt) for mnt in data_maintainers]

        return cls(
            data_kind=data["data_kind"],
            datafile_path_template=data.get("datafile_path_template"),
            purl_type_configs=ptcs,
            data_schema_url=data.get("data_schema_url"),
            description=data.get("description"),
            documentation_url=data.get("documentation_url"),
            data_license=data.get("data_license"),
            data_maintainers=data_maintainers,
        )

    def to_dict(self):
        """
        Return a mapping for this cluster configuration.
        """
        return dict(
            data_kind=self.data_kind,
            datafile_path_template=self.datafile_path_template,
            purl_type_configs=[pt.to_dict() for pt in self.purl_type_configs],
            data_schema_url=self.data_schema_url,
            description=self.description,
            documentation_url=self.documentation_url,
            data_license=self.data_license,
            data_maintainers=[m.to_dict() for m in self.data_maintainers],
        )

    def split_cluster(self, number_of_repos, number_of_dirs):
        """
        Split the repositories of a cluster in more repositories and directories
        """
        raise NotImplementedError()

    def get_datafile_download_url(self, purl: Union[str, "PackageURL"]) -> str:
        """
        Return the direct download URL to the data file of the data kind stored
        in this cluster given a PURL.

        TODO: not implemented yet. The intended approach is to resolve the
        PurlTypeConfig with get_config(), compute the repo name and relative
        path with get_datafile_repo_and_path(), and join these with the cluster
        remote base URL.
        """
        # NOTE: an earlier unreachable draft here called the configs mapping as
        # a function and referenced a non-existent self.default_config(); it
        # was removed in favor of the TODO above.
        raise NotImplementedError()

    def get_local_datafile(self, purl: Union[str, "PackageURL"]) -> "LocalDataFile":
        """
        Return a LocalDataFile of the data kind stored in this cluster given a
        PURL, or None
        """
        raise NotImplementedError()

    def get_config(self, purl_type: str) -> "PurlTypeConfig":
        """
        Return the PurlTypeConfig for ``purl_type``, falling back to the
        "default" config when this type has no explicit configuration.
        """
        try:
            return self._configs_by_purl_type[purl_type]
        except KeyError:
            return self._configs_by_purl_type["default"]

    def get_datafile_relative_path(self, purl: Union[str, "PackageURL"]) -> str:
        """
        Return the datafile path relative to the root of a cluster directory
        given a PURL. Raise ValueError if the path template needs a version
        and the PURL has none.
        """
        purl = as_purl(purl=purl)
        if not purl.version and "{version}" in self.datafile_path_template:
            raise ValueError(
                f"DataCluster '{self.data_kind}' needs PackageURL with version to generate path."
            )
        template = uritemplate.URITemplate(self.datafile_path_template)
        return template.expand(
            namespace=purl.namespace,
            name=purl.name,
            version=purl.version,
        )

    def get_repo_and_dir_hash(self, purl: Union[str, "PackageURL"]) -> Tuple[str, str]:
        """
        Return a tuple of (repository hash, directory hash) strings for a PURL.
        The directory hash is the PURL hashid; the repository hash is the first
        hashid of the range of hashids stored in that repository.
        """
        purl = as_purl(purl=purl)
        ptc = self.get_config(purl.type)
        purl_hashid = compute_purl_hash(purl=purl)
        purl_hash = int(purl_hashid)
        # round down to the first hashid of this repo's range of directories
        repo_hash = purl_hash - (purl_hash % ptc.numbers_of_dirs_per_repo)
        return f"{repo_hash:04}", purl_hashid

    def get_datafile_repo_and_path(self, purl: Union[str, "PackageURL"]) -> Tuple[str, str]:
        """
        Return the repository name and relative path to the datafile of the data kind stored
        in this cluster given a PURL.
        """
        purl = as_purl(purl)
        repo_hash, dir_hash = self.get_repo_and_dir_hash(purl)
        relative_datafile_path = self.get_datafile_relative_path(purl)
        directory_name = f"{purl.type}-{dir_hash}"
        repository_name = f"{self.data_kind}-{purl.type}-{repo_hash}"
        # assumes the expanded template starts with a "/" separator, as in the
        # "{/namespace}/{name}/..." examples — TODO confirm for all templates
        datafile_path = f"{directory_name}{relative_datafile_path}"
        return repository_name, datafile_path
@dataclass
class PurlTypeConfig:
    """
    Configuration settings for a PURL type stored in a DataCluster
    """

    # Maximum number of dirs we can support
    # at 10Gb per dir, that would support 80TB
    MAX_NUMBER_OF_DIRS = 8192

    # purl type or "default" for a default that applies to all types
    purl_type: str

    # number of repos for this PURL type in a cluster
    # validated in __post_init__: must be a power of 2, <= number_of_dirs
    number_of_repos: int = 1

    # number of dirs for this PURL type in a cluster. Also defines the max PURL
    # hash value.
    # validated in __post_init__: must be a power of 2, <= MAX_NUMBER_OF_DIRS
    number_of_dirs: int = 1024
def to_dict(self) -> dict[str, Any]:
return dict(
purl_type=self.purl_type,
number_of_repos=self.number_of_repos,
number_of_dirs=self.number_of_dirs,
)
def __post_init__(self):
self.number_of_repos = int(self.number_of_repos)
self.number_of_dirs = int(self.number_of_dirs)
if not self.number_of_dirs or self.number_of_dirs > self.MAX_NUMBER_OF_DIRS:
raise TypeError(
f"number_of_dirs {self.number_of_dirs!r} "
f"must be between 1 and {self.MAX_NUMBER_OF_DIRS} included"
)
if not is_valid_power_of_two(self.number_of_dirs):
raise TypeError(f"number_of_dirs must be a power of 2, " f"not {self.number_of_dirs!r}")
if not self.number_of_repos or self.number_of_repos > self.number_of_dirs:
raise TypeError(
f"number_of_repos {self.number_of_repos!r} must be between "
f"1 and {self.number_of_dirs!r}"
)
if not is_valid_power_of_two(self.number_of_repos):
raise TypeError(
f"number_of_repos must be a power of 2, " f"not {self.number_of_repos!r}"
)
@property
def numbers_of_dirs_per_repo(self) -> int:
"""
Return the number of directories in each repos for this type.
It can be any power of 2 from 1 to number_of_dirs (default to 1024)
"""
return self.number_of_dirs // self.number_of_repos
@property
def hashids(self) -> list[str]:
"""
Return a list of hashid 4-char strings for this PURL type.
"""
# all possible hashids as 4-char strings padded with zeros
return [f"{v:04}" for v in range(self.number_of_dirs)]
def get_repos(self, data_kind: str) -> Iterable["DataRepository"]:
"""
Yield DataRepository (populated with DataDirectory) for this PURL type.
"""
purl_type = self.purl_type
dirs_per_repo = self.numbers_of_dirs_per_repo
# all possible hashids as 4-char strings padded with zeros
hashids = self.hashids
for i in range(0, self.number_of_dirs, dirs_per_repo):
hashids_of_repo = hashids[i : i + dirs_per_repo]
yield DataRepository.from_hashids(
data_kind=data_kind,
purl_type=purl_type,
hashids=hashids_of_repo,
)
@classmethod
def default_config(cls) -> "PurlTypeConfig":
"""
Return the default used when nothing is specified for a type
"""
return cls(
purl_type="default",
number_of_repos=1,
number_of_dirs=cls.number_of_dirs,
)
@classmethod
def large_size_configs(cls):
"""
Return a list of initial PurlTypeConfig for common types to be used as
template when configuring clusters from scratch for storing data of
large size (scans, etc)
"""
# This is an initial tiering by type system for storing package metadata
# where the datafile would be large.
# The tiers are as follows: