# Copyright (c) 2020, 2021 The Linux Foundation
#
# SPDX-License-Identifier: Apache-2.0

import hashlib
import os
import re

from west import log

from zspdx.licenses import LICENSES
from zspdx.util import getHashes


# ScannerConfig contains settings used to configure how the SPDX
# Document scanning should occur.
class ScannerConfig:
    """Settings that control how scanDocument() processes a Document."""

    def __init__(self):
        super().__init__()

        # when assembling a Package's data, should we auto-conclude the
        # Package's license, based on the licenses of its Files?
        self.shouldConcludePackageLicense = True

        # when assembling a Package's Files' data, should we auto-conclude
        # each File's license, based on its detected license(s)?
        self.shouldConcludeFileLicenses = True

        # number of lines to scan for SPDX-License-Identifier (0 = all)
        # defaults to 20
        self.numLinesScanned = 20

        # should we calculate SHA256 hashes for each Package's Files?
        # note that SHA1 hashes are mandatory, per SPDX 2.2
        self.doSHA256 = True

        # should we calculate MD5 hashes for each Package's Files?
        self.doMD5 = False


def parseLineForExpression(line):
    """
    Return parsed SPDX expression if tag found in line, or None otherwise.

    Arguments:
        - line: a single line of text from a scanned file
    Returns: the text following the "SPDX-License-Identifier:" tag with
             surrounding whitespace and trailing comment marks removed,
             or None if the tag is absent or nothing usable follows it.
    """
    _, _, remainder = line.partition("SPDX-License-Identifier:")
    if remainder == "":
        return None

    # strip away trailing comment marks and whitespace, if any.
    # note that rstrip("/*") removes any trailing run of '/' and '*'
    # characters, which covers a closing "*/".
    expression = remainder.strip().rstrip("/*").strip()

    # a tag followed only by whitespace / comment marks carries no
    # expression; report it as "not found" so that callers keep scanning
    # instead of stopping on an empty result.
    return expression or None


def getExpressionData(filePath, numLines):
    """
    Scans the specified file for the first SPDX-License-Identifier:
    tag in the file.

    Arguments:
        - filePath: path to file to scan.
        - numLines: number of lines to scan for an expression before
                    giving up. If 0, will scan the entire file.
    Returns: parsed expression if found; None if not found.
    """
    log.dbg(f"  - getting licenses for {filePath}")

    # decode explicitly as UTF-8 rather than the locale's default encoding,
    # so that results do not vary by platform and so that the
    # UnicodeDecodeError handling below means what its comment says.
    with open(filePath, "r", encoding="utf-8") as f:
        try:
            for lineno, line in enumerate(f, start=1):
                # numLines == 0 means "no limit"
                if lineno > numLines > 0:
                    break
                expression = parseLineForExpression(line)
                if expression is not None:
                    return expression
        except UnicodeDecodeError:
            # invalid UTF-8 content
            return None

    # if we get here, we didn't find an expression
    return None


def splitExpression(expression):
    """
    Parse a license expression into its constituent identifiers.

    Arguments:
        - expression: SPDX license expression
    Returns: sorted array of split identifiers
    """
    # remove parens and plus sign
    stripped = re.sub(r'\(|\)|\+', "", expression)

    # remove word operators, ignoring case, leaving a blank space
    withoutOperators = re.sub(r' AND | OR | WITH ', " ", stripped,
                              flags=re.IGNORECASE)

    # split on any run of whitespace, so that repeated or leading/trailing
    # spaces don't produce empty-string "identifiers"
    return sorted(withoutOperators.split())


def calculateVerificationCode(pkg):
    """
    Calculate the SPDX Package Verification Code for all files in the package.

    The code is the SHA1 hex digest of the concatenation of the sorted
    SHA1 values of the package's files.

    Arguments:
        - pkg: Package
    Returns: verification code as string
    """
    sortedHashes = sorted(f.sha1 for f in pkg.files.values())
    filelist = "".join(sortedHashes)

    hSHA1 = hashlib.sha1()
    hSHA1.update(filelist.encode('utf-8'))
    return hSHA1.hexdigest()


def checkLicenseValid(lic, doc):
    """
    Check whether this license ID is a valid SPDX license ID, and add it
    to the custom license IDs set for this Document if it isn't.

    Arguments:
        - lic: detected license ID
        - doc: Document
    """
    if lic not in LICENSES:
        doc.customLicenseIDs.add(lic)


def getPackageLicenses(pkg):
    """
    Extract lists of all concluded and infoInFile licenses seen.

    Arguments:
        - pkg: Package
    Returns: sorted list of concluded license exprs,
             sorted list of infoInFile ID's
    """
    licsConcluded = set()
    licsFromFiles = set()
    for f in pkg.files.values():
        licsConcluded.add(f.concludedLicense)
        licsFromFiles.update(f.licenseInfoInFile)
    return sorted(licsConcluded), sorted(licsFromFiles)


def normalizeExpression(licsConcluded):
    """
    Combine array of license expressions into one AND'd expression,
    adding parens where needed.

    Arguments:
        - licsConcluded: array of license expressions
    Returns: string with single AND'd expression; "NOASSERTION" if there
             is nothing to combine.
    """
    # return appropriate for simple cases
    if len(licsConcluded) == 0:
        return "NOASSERTION"
    if len(licsConcluded) == 1:
        return licsConcluded[0]

    # more than one, so we'll need to combine them
    # iff an expression has spaces, it needs parens
    revised = []
    for lic in licsConcluded:
        if lic in ["NONE", "NOASSERTION"]:
            continue
        if " " in lic:
            revised.append(f"({lic})")
        else:
            revised.append(lic)

    # every entry was NONE / NOASSERTION: don't return an empty string
    if not revised:
        return "NOASSERTION"

    return " AND ".join(revised)


def scanDocument(cfg, doc):
    """
    Scan for licenses and calculate hashes for all Files
    and Packages in this Document.

    Files for which getHashes() returns nothing are logged and skipped:
    no hashes or license data are recorded for them here.

    Arguments:
        - cfg: ScannerConfig
        - doc: Document
    """
    for pkg in doc.pkgs.values():
        log.inf(f"scanning files in package {pkg.cfg.name} in document {doc.cfg.name}")

        # first, gather File data for this package
        for f in pkg.files.values():
            # set relpath based on package's relativeBaseDir
            f.relpath = os.path.relpath(f.abspath, pkg.cfg.relativeBaseDir)

            # get hashes for file
            hashes = getHashes(f.abspath)
            if not hashes:
                log.wrn(f"unable to get hashes for file {f.abspath}; skipping")
                continue
            hSHA1, hSHA256, hMD5 = hashes
            f.sha1 = hSHA1
            if cfg.doSHA256:
                f.sha256 = hSHA256
            if cfg.doMD5:
                f.md5 = hMD5

            # get licenses for file
            expression = getExpressionData(f.abspath, cfg.numLinesScanned)
            if expression:
                if cfg.shouldConcludeFileLicenses:
                    f.concludedLicense = expression
                f.licenseInfoInFile = splitExpression(expression)

            # check if any custom license IDs should be flagged for document
            for lic in f.licenseInfoInFile:
                checkLicenseValid(lic, doc)

        # now, assemble the Package data
        licsConcluded, licsFromFiles = getPackageLicenses(pkg)
        if cfg.shouldConcludePackageLicense:
            pkg.concludedLicense = normalizeExpression(licsConcluded)
        pkg.licenseInfoFromFiles = licsFromFiles
        pkg.verificationCode = calculateVerificationCode(pkg)