Support third_party.identifier in METADATA files of external packages.

Bug: 303688820
Test: CIs
Test: "m sbom" after lunch
Change-Id: Ic329d87cdcfbe4152b0fe6a8fd71c4867593b674
diff --git a/tools/protos/metadata_file.proto b/tools/protos/metadata_file.proto
index ac1129a..47562c5 100644
--- a/tools/protos/metadata_file.proto
+++ b/tools/protos/metadata_file.proto
@@ -92,6 +92,8 @@
     SBOMRef sbom_ref = 10;
   }
 
+  // Identifiers for the package.
+  repeated Identifier identifier = 11;
 }
 
 // URL associated with a third-party package.
@@ -278,4 +280,136 @@
   // https://spdx.github.io/spdx-spec/v2.3/package-information/#72-package-spdx-identifier-field or
   // https://spdx.github.io/spdx-spec/v2.3/file-information/#82-file-spdx-identifier-field
   optional string element_id = 3;
+}
+
+// Identifier for a third-package package.
+// See go/tp-metadata-id.
+message Identifier {
+  // The type of the identifier. Either an "ecosystem" value from
+  // https://ossf.github.io/osv-schema/#affectedpackage-field such as "Go",
+  // "npm" or "PyPI". The "value" and "version" fields follow the same rules as
+  // defined in the OSV spec.
+
+  // Or one of:
+  //  - "Git": The "value" field is the URL of the upstream git repository this
+  //  package is retrieved from.
+  //  For example:
+  //   - https://github.com/git/git
+  //   - git://git.kernel.org/pub/scm/git/git
+  //
+  //  Use of a git URL requires that the package "version" value must specify a
+  //  specific git tag or revision. This must not be a branch name.
+  //
+  //  - "SVN": The "value" field is the URL of the upstream SVN repository this
+  //  package is retrieved from.
+  //  For example:
+  //   - http://llvm.org/svn/llvm-project/llvm/
+  //
+  //  Use of an SVN URL requires that the package "version" value must specify
+  //  a specific SVN tag or revision. This must not be a branch name.
+  //
+  //  - "Hg": The "value" field is the URL of the upstream mercurial repository
+  //  this package is retrieved from.
+  //  For example:
+  //   - https://mercurial-scm.org/repo/evolve
+  //
+  //  Use of a mercurial URL requires that the package "version" value must
+  //  specify a specific tag or revision. This must not be a branch name.
+  //
+  //  - "Darcs": the "value" field is the URL of the upstream darcs repository
+  //  this package is retrieved from.
+  //  For example:
+  //   - https://hub.darcs.net/hu.dwim/hu.dwim.util
+  //
+  //  Use of a Darcs URL requires that the package "version" value must
+  //  specify a specific tag or revision. This must not be a branch name.
+  //
+  //  - "Piper": The "value" field is the URL of the upstream piper location.
+  //  This is primarily used when a package is being migrated into third_party
+  //  from elsewhere in Piper, or when a package is being newly developed in
+  //  third_party.
+  //
+  //  - "VCS": This is a generic fallback for an unlisted VCS system. The
+  // "value" field is the URL of the repository for this VCS.
+  //
+  //  - "Archive": The "value" field is the URL of the archive containing the
+  //  source code for the package, for example a zip or tgz file.
+  //
+  //  - "PrebuiltByAlphabet": This type should be used for archives of primarily
+  //  Google-owned source code (may contain non-Google-owned dependencies),
+  //  which has been built using production Google infrastructure, and copied
+  //  into third_party.
+  //
+  //  - "LocalSource": The "value" field is the URL identifying where the local
+  //  copy of the package source code can be found.
+  //  Examples:
+  //   - https://android.googlesource.com/platform/external/apache-http/
+  //
+  //  Typically, the metadata files describing a package reside in the same
+  //  directory as the source code for the package. In a few rare cases where
+  //  they are separate, the LocalSource URL identifies where to find the
+  //  source code. This only describes where to find the local copy of the
+  //  source; there should always be an additional URL describing where the
+  //  package was retrieved from.
+  //
+  //  - "Other": An identifier that does not fit any other type. This may also
+  //  indicate that the Source code was received via email or some other
+  //  out-of-band way. This is most commonly used with commercial software
+  //  received directly from the Vendor. In the case of email, the "value" field
+  //  can be used to provide additional information about how it was received.
+  optional string type = 1;
+
+  // A human readable string to indicate why a third-package package does not
+  // have this identifier type set.
+  // Example:
+  //   identifier {
+  //     type: "PyPI"
+  //     omission_reason: "Only on Git. Not published to PyPI."
+  //   }
+  optional string omission_reason = 2;
+
+  // The value of the package identifier as defined by the "type".
+  // Example:
+  //  identifier {
+  //    type: "PyPI"
+  //    value: "django"
+  //    version: "3.2.8"
+  //  }
+  optional string value = 3;
+
+  // The version associated with this package as defined by the "type".
+  // Example:
+  //  identifier {
+  //    type: "PyPI"
+  //    value: "django"
+  //    version: "3.2.8"
+  //  }
+  optional string version = 4;
+
+  // The closest version associated with this package as defined by the "type".
+  // This should only be set by automated infrastructure by applying automated
+  // heuristics, such as the closest git tag or package version from a package
+  // manifest file (e.g. pom.xml).
+  //
+  // For most identifier types, only one of `version` or `closest_version`
+  // should be set (not both). The exception is source repository types such as
+  // "Git", where `version` will refer to a git commit, and `closest_version`
+  // refers to a git tag.
+  // Example:
+  //  identifier {
+  //    type: "Git",
+  //    value: "https://github.com/my/repo"
+  //    version: "e5fa44f2b31c1fb553b6021e7360d07d5d91ff5e"
+  //    closest_version: "v1.4"
+  //  }
+  optional string closest_version = 5;
+
+  // When `true`, this Identifier represents the location from which the source
+  // code for this package was originally obtained. This should only be set for
+  // *one* Identifier in a third_party package's METADATA.
+
+  // For external packages, this is typically for the Identifier associated
+  // with the version control system or package manager that was used to
+  // check out or download the code.
+  optional bool primary_source = 6;
 }
\ No newline at end of file
diff --git a/tools/sbom/generate-sbom.py b/tools/sbom/generate-sbom.py
index b19be87..0a8f10a 100755
--- a/tools/sbom/generate-sbom.py
+++ b/tools/sbom/generate-sbom.py
@@ -82,6 +82,46 @@
   'vndk_prebuilt_shared',
 ]
 
+THIRD_PARTY_IDENTIFIER_TYPES = [
+    # Types defined in metadata_file.proto
+    'Git',
+    'SVN',
+    'Hg',
+    'Darcs',
+    'VCS',
+    'Archive',
+    'PrebuiltByAlphabet',
+    'LocalSource',
+    'Other',
+    # OSV ecosystems defined at https://ossf.github.io/osv-schema/#affectedpackage-field.
+    'Go',
+    'npm',
+    'OSS-Fuzz',
+    'PyPI',
+    'RubyGems',
+    'crates.io',
+    'Hackage',
+    'GHC',
+    'Packagist',
+    'Maven',
+    'NuGet',
+    'Linux',
+    'Debian',
+    'Alpine',
+    'Hex',
+    'Android',
+    'GitHub Actions',
+    'Pub',
+    'ConanCenter',
+    'Rocky Linux',
+    'AlmaLinux',
+    'Bitnami',
+    'Photon OS',
+    'CRAN',
+    'Bioconductor',
+    'SwiftURL'
+]
+
 
 def get_args():
   parser = argparse.ArgumentParser()
@@ -360,6 +400,20 @@
   return True
 
 
+# Validate identifiers in a package's METADATA.
+# 1) Only known identifier type is allowed
+# 2) Only one identifier's primary_source can be true
+def validate_package_metadata(metadata_file_path, package_metadata):
+  primary_source_found = False
+  for identifier in package_metadata.third_party.identifier:
+    if identifier.type not in THIRD_PARTY_IDENTIFIER_TYPES:
+      sys.exit(f'Unknown value of third_party.identifier.type in {metadata_file_path}/METADATA: {identifier.type}.')
+    if primary_source_found and identifier.primary_source:
+      sys.exit(
+        f'Field "primary_source" is set to true in multiple third_party.identifier in {metadata_file_path}/METADATA.')
+    primary_source_found = identifier.primary_source
+
+
 def report_metadata_file(metadata_file_path, installed_file_metadata, report):
   if metadata_file_path:
     report[INFO_METADATA_FOUND_FOR_PACKAGE].append(
@@ -372,6 +426,8 @@
     with open(metadata_file_path + '/METADATA', 'rt') as f:
       text_format.Parse(f.read(), package_metadata)
 
+    validate_package_metadata(metadata_file_path, package_metadata)
+
     if not metadata_file_path in metadata_file_protos:
       metadata_file_protos[metadata_file_path] = package_metadata
       if not package_metadata.name: