diff options
Diffstat (limited to 'llvm_tools/werror_logs.py')
-rwxr-xr-x | llvm_tools/werror_logs.py | 577 |
1 file changed, 577 insertions, 0 deletions
#!/usr/bin/env python3
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Helps reason about -Werror logs emitted by the compiler wrapper.

Specifically, this works with the -Werror reports produced by the compiler
wrapper in FORCE_DISABLE_WERROR mode. It's intended to be run on trees of these
reports, so devs can run roughly the following commands:

$ apply_force_disable_werror # (There's no actual script to do this today.)
$ build_packages --board=foo --nousepkg
$ ./werror_logs.py aggregate --directory=/build/foo/var/lib/chromeos

And see a full aggregation of all warnings that were suppressed in that
`build_packages` invocation.

It can also be used to fetch warnings reports from CQ runs, for instance,
$ ./werror_logs.py fetch-cq --cq-orchestrator-id=123456

In this case, it downloads _all -Werror logs_ from children of the given
cq-orchestrator, and prints the parent directory of all of these reports. If
you run `aggregate` on this directory, it's highly recommended to use the
`--canonicalize-board-roots` flag.
"""

import argparse
import collections
import dataclasses
import json
import logging
import multiprocessing.pool
import os
from pathlib import Path
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from typing import Any, Counter, DefaultDict, Dict, IO, Iterable, List, Optional

import cros_cls


_DEFAULT_FETCH_DIRECTORY = Path("/tmp/werror_logs")


def canonicalize_file_path_board_root(file_path: str) -> str:
    """Normalizes `file_path`, replacing any board name with `{board}`.

    E.g., `/build/atlas//foo/bar.h` becomes `/build/{board}/foo/bar.h`, so
    warnings from different boards' sysroots aggregate together.
    """
    # Get rid of double slashes, unnecessary directory traversal
    # (foo/../bar/..), etc. Easier to read this way.
    file_path = os.path.normpath(file_path)
    if file_path.startswith("/build/"):
        i = file_path.find("/", len("/build/"))
        if i != -1:
            # NOTE: `{{board}}` renders as the literal text `{board}`.
            return f"/build/{{board}}/{file_path[i+1:]}"
    return file_path


@dataclasses.dataclass(frozen=True, eq=True, order=True)
class ClangWarningLocation:
    """Represents a location at which a Clang warning was emitted."""

    file: str
    line: int
    column: int

    @classmethod
    def parse(
        cls, location: str, canonicalize_board_root: bool = False
    ) -> "ClangWarningLocation":
        """Parses a `file:line:col` string into a ClangWarningLocation.

        Raises:
            ValueError if `location` is not of the form `file:line:col`.
        """
        # rsplit, since the file name itself may contain colons.
        split = location.rsplit(":", 2)
        if len(split) == 3:
            file = split[0]
            if canonicalize_board_root:
                file = canonicalize_file_path_board_root(file)
            return cls(file=file, line=int(split[1]), column=int(split[2]))
        raise ValueError(f"Invalid location: {location!r}")


@dataclasses.dataclass(frozen=True, eq=True)
class ClangWarning:
    """Represents a Clang warning at a specific location (if applicable)."""

    # The name of the warning, e.g., -Wunused-variable
    name: str
    # The message of the warning, e.g., "'allocate' is deprecated."
    message: str
    # The location of this warning. Not present for frontend diagnostics.
    location: Optional[ClangWarningLocation]

    # This parses two kinds of errors:
    # 1. `clang-17: error: foo [-W...]`
    # 2. `/file/path:123:45: error: foo [-W...]"
    _WARNING_RE = re.compile(
        # Capture the location on its own, since `clang-\d+` is unused below.
        r"^(?:([^:]*:\d+:\d+)|clang-\d+)"
        r": error: "
        # Capture the message
        r"(.*?)\s+"
        r"\[(-W[^\][]+)]\s*$"
    )

    @classmethod
    def try_parse_line(
        cls, line: str, canonicalize_board_root: bool = False
    ) -> Optional["ClangWarning"]:
        """Parses one compiler-output line into a ClangWarning, if possible.

        Returns:
            None if `line` isn't a -Werror diagnostic; otherwise, the parsed
            warning.

        Raises:
            ValueError if the line carries an unexpected number of warning
            flags.
        """
        # Fast path: we can expect "error: " in interesting lines. Break early
        # if that's not present.
        if "error: " not in line:
            return None

        m = cls._WARNING_RE.fullmatch(line)
        if not m:
            return None

        location, message, warning_flags = m.groups()
        # Drop -Werror itself; only the specific -W flag(s) are interesting.
        individual_warning_flags = [
            x for x in warning_flags.split(",") if x != "-Werror"
        ]

        # This isn't impossible to handle in theory, just unexpected. Complain
        # about it.
        if len(individual_warning_flags) != 1:
            raise ValueError(
                f"Weird: parsed warnings {individual_warning_flags} out "
                f"of {line}"
            )

        if location is None:
            parsed_location = None
        else:
            parsed_location = ClangWarningLocation.parse(
                location, canonicalize_board_root
            )
        return cls(
            name=individual_warning_flags[0],
            message=message,
            location=parsed_location,
        )


@dataclasses.dataclass(frozen=True, eq=True)
class WarningInfo:
    """Carries information about a ClangWarning."""

    # Maps package name -> number of times the warning was seen in it.
    packages: DefaultDict[str, int] = dataclasses.field(
        default_factory=lambda: collections.defaultdict(int)
    )


class UnknownPackageNameError(ValueError):
    """Raised when a package name can't be determined from a warning report."""


@dataclasses.dataclass
class AggregatedWarnings:
    """Aggregates warning reports incrementally."""

    num_reports: int = 0
    # Mapping of warning -> list of packages that emitted it. Warnings in
    # headers may be referred to by multiple packages.
    warnings: DefaultDict[ClangWarning, WarningInfo] = dataclasses.field(
        default_factory=lambda: collections.defaultdict(WarningInfo)
    )

    # Extracts `category/package` from a portage build directory path, with
    # an optional `/build/<board>` sysroot prefix.
    _CWD_PACKAGE_RE = re.compile(
        r"^(?:/build/[^/]+)?/var/(?:cache|tmp)/portage/([^/]+/[^/]+)/"
    )

    @classmethod
    def _guess_package_name(cls, report: Dict[str, Any]) -> str:
        """Tries to guess what package `report` is from.

        Raises:
            UnknownPackageNameError if the package's name couldn't be
            determined.
        """
        m = cls._CWD_PACKAGE_RE.match(report.get("cwd", ""))
        if not m:
            raise UnknownPackageNameError()
        return m.group(1)

    def add_report_json(
        self, report_json: Dict[str, Any], canonicalize_board_root: bool = False
    ) -> int:
        """Adds the given report, returning the number of warnings parsed.

        Raises:
            UnknownPackageNameError if the package's name couldn't be
            determined.
        """
        self.num_reports += 1
        package_name = self._guess_package_name(report_json)

        num_warnings = 0
        for line in report_json.get("stdout", "").splitlines():
            if parsed := ClangWarning.try_parse_line(
                line, canonicalize_board_root
            ):
                self.warnings[parsed].packages[package_name] += 1
                num_warnings += 1

        return num_warnings

    def add_report(
        self, report_file: Path, canonicalize_board_root: bool = False
    ) -> None:
        """Loads a JSON report from `report_file` and aggregates it.

        Reports that can't be attributed to a package, or that contain no
        parseable warnings, are logged and otherwise ignored.
        """
        with report_file.open(encoding="utf-8") as f:
            report = json.load(f)

        try:
            n = self.add_report_json(report, canonicalize_board_root)
        except UnknownPackageNameError:
            logging.warning(
                "Failed guessing package name for report at %r; ignoring file",
                report_file,
            )
            return

        if not n:
            logging.warning(
                "Report at %r had no parseable warnings", report_file
            )


def print_aligned_counts(
    name_count_map: Dict[str, int], file: Optional[IO[str]] = None
) -> None:
    """Prints `name: count` lines, column-aligned and sorted by count.

    `name_count_map` must be nonempty.
    """
    assert name_count_map
    # Sort on value, highest first. Name breaks ties.
    summary = sorted(name_count_map.items(), key=lambda x: (-x[1], x[0]))
    # Widths come from the widest count (the first, post-sort) and name.
    num_col_width = len(f"{summary[0][1]:,}")
    name_col_width = max(len(x) for x in name_count_map)
    for name, count in summary:
        fmt_name = name.rjust(name_col_width)
        fmt_count = f"{count:,}".rjust(num_col_width)
        print(f"\t{fmt_name}: {fmt_count}", file=file)


def summarize_per_package_warnings(
    warning_infos: Iterable[WarningInfo],
    file: Optional[IO[str]] = None,
) -> None:
    """Prints a summary of total warning counts per package."""
    warnings_per_package: DefaultDict[str, int] = collections.defaultdict(int)
    for info in warning_infos:
        for package_name, warning_count in info.packages.items():
            warnings_per_package[package_name] += warning_count

    if not warnings_per_package:
        return

    print("## Per-package warning counts:", file=file)
    print_aligned_counts(warnings_per_package, file=file)


def summarize_warnings_by_flag(
    warnings: Dict[ClangWarning, WarningInfo],
    file: Optional[IO[str]] = None,
) -> None:
    """Prints a summary of warning counts per -W flag."""
    if not warnings:
        return

    warnings_per_flag: Counter[str] = collections.Counter()
    for warning, info in warnings.items():
        warnings_per_flag[warning.name] += sum(info.packages.values())

    print("## Instances of each fatal warning:", file=file)
    print_aligned_counts(warnings_per_flag, file=file)


def aggregate_reports(opts: argparse.Namespace) -> None:
    """Implements the `aggregate` subcommand.

    Raises:
        ValueError if no warnings reports were found under `opts.directory`.
    """
    directory = opts.directory
    aggregated = AggregatedWarnings()
    for report in directory.glob("**/warnings_report*.json"):
        logging.debug("Discovered report %s", report)
        aggregated.add_report(report, opts.canonicalize_board_roots)

    if not aggregated.num_reports:
        raise ValueError(f"Found no warnings report under {directory}")

    logging.info("Discovered %d report files in total", aggregated.num_reports)
    summarize_per_package_warnings(aggregated.warnings.values())
    summarize_warnings_by_flag(aggregated.warnings)


def fetch_werror_tarball_links(
    child_builders: Dict[str, cros_cls.BuildID]
) -> List[str]:
    """Returns gs:// URLs of -Werror tarballs from the given child builders.

    Builders without output artifacts are logged and skipped.
    """
    outputs = cros_cls.CQBoardBuilderOutput.fetch_many(child_builders.values())
    artifacts_links = []
    # fetch_many preserves order, so zip with the builder names for logging.
    for builder_name, out in zip(child_builders, outputs):
        if out.artifacts_link:
            artifacts_links.append(out.artifacts_link)
        else:
            logging.info("%s had no output artifacts; ignoring", builder_name)

    gsutil_stdout = subprocess.run(
        ["gsutil", "-m", "ls"] + artifacts_links,
        check=True,
        encoding="utf-8",
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
    ).stdout

    return [
        x
        for x in gsutil_stdout.splitlines()
        if x.endswith(".fatal_clang_warnings.tar.xz")
    ]


def cq_builder_name_from_werror_logs_path(werror_logs: str) -> str:
    """Returns the CQ builder given a -Werror logs path.

    >>> cq_builder_name_from_werror_logs_path(
        "gs://chromeos-image-archive/staryu-cq/"
        "R123-15771.0.0-94466-8756713501925941617/"
        "staryu.20240207.fatal_clang_warnings.tar.xz"
    )
    "staryu-cq"
    """
    return os.path.basename(os.path.dirname(os.path.dirname(werror_logs)))


def download_and_unpack_werror_tarballs(
    unpack_dir: Path, download_dir: Path, gs_urls: List[str]
):
    """Downloads all `gs_urls` tarballs and unpacks them under `unpack_dir`.

    Each URL gets a per-builder subdirectory in both `download_dir` (tarball)
    and `unpack_dir` (extracted contents). Both directories are created here
    and must not already exist.

    Raises:
        ValueError if any download or extraction failed.
    """
    # This is necessary below when we're untarring files. It should trivially
    # always be the case, and assuming it makes testing easier.
    assert download_dir.is_absolute(), download_dir

    unpack_dir.mkdir()
    download_dir.mkdir()

    logging.info(
        "Fetching and unpacking %d -Werror reports; this may take a bit",
        len(gs_urls),
    )
    # Run the download in a threadpool since we can have >100 logs, and all of
    # this is heavily I/O-bound.
    # Max 8 downloads at a time is arbitrary, but should minimize the chance of
    # rate-limiting. Don't limit `tar xaf`, since those should be short-lived.
    download_limiter = threading.BoundedSemaphore(8)

    def download_one_url(
        unpack_dir: Path, download_dir: Path, gs_url: str
    ) -> Optional[subprocess.CalledProcessError]:
        """Downloads and unpacks -Werror logs from the given gs_url.

        Leaves the tarball in `download_dir`, and the unpacked version in
        `unpack_dir`.

        Returns:
            None if all went well; otherwise, returns the command that failed.
            All commands have stderr data piped in.
        """
        file_targ = download_dir / os.path.basename(gs_url)
        try:
            with download_limiter:
                subprocess.run(
                    ["gsutil", "cp", gs_url, file_targ],
                    check=True,
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.PIPE,
                    encoding="utf-8",
                    errors="replace",
                )

            # N.B., file_targ is absolute, so running with `file_targ` while
            # changing `cwd` is safe.
            subprocess.run(
                ["tar", "xaf", file_targ],
                check=True,
                cwd=unpack_dir,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                encoding="utf-8",
                errors="replace",
            )
        except subprocess.CalledProcessError as e:
            return e
        return None

    with multiprocessing.pool.ThreadPool() as thread_pool:
        download_futures = []
        for gs_url in gs_urls:
            name = cq_builder_name_from_werror_logs_path(gs_url)
            unpack_to = unpack_dir / name
            unpack_to.mkdir()
            download_to = download_dir / name
            download_to.mkdir()
            download_futures.append(
                (
                    name,
                    thread_pool.apply_async(
                        download_one_url, (unpack_to, download_to, gs_url)
                    ),
                )
            )

        # Report every failure before raising, rather than dying at the first.
        num_failures = 0
        for name, future in download_futures:
            result = future.get()
            if not result:
                continue

            num_failures += 1
            logging.error(
                "Downloading %s failed: running %r. Stderr: %r",
                name,
                result.cmd,
                result.stderr,
            )
    if num_failures:
        raise ValueError(f"{num_failures} download(s) failed.")


def fetch_cq_reports(opts: argparse.Namespace) -> None:
    """Implements the `fetch-cq` subcommand.

    Raises:
        ValueError if no CQ orchestrator, child builders, or -Werror logs
        could be found.
    """
    if opts.cl:
        logging.info(
            "Fetching most recent completed CQ orchestrator from %s", opts.cl
        )
        all_ids = cros_cls.fetch_cq_orchestrator_ids(opts.cl)
        if not all_ids:
            raise ValueError(
                f"No CQ orchestrators found under {opts.cl}. See --help for "
                "how to pass a build ID directly."
            )
        # Note that these cq-orchestrator runs are returned in oldest-to-newest
        # order. The user probably wants the newest run.
        cq_orchestrator_id = all_ids[-1]
        cq_orchestrator_url = cros_cls.builder_url(cq_orchestrator_id)
        logging.info("Checking CQ run %s", cq_orchestrator_url)
    else:
        cq_orchestrator_id = opts.cq_orchestrator_id
        cq_orchestrator_url = cros_cls.builder_url(cq_orchestrator_id)

    # This is the earliest point at which we can compute this directory with
    # certainty. Figure it out now and fail early if it exists.
    output_directory = opts.directory
    if not output_directory:
        output_directory = _DEFAULT_FETCH_DIRECTORY / str(cq_orchestrator_id)

    if output_directory.exists():
        if not opts.force:
            sys.exit(
                f"Directory at {output_directory} exists; not overwriting. "
                "Pass --force to overwrite."
            )
        # Actually _remove_ it when we have all logs unpacked and are able to
        # create the output directory with confidence.

    logging.info("Fetching info on child builders of %s", cq_orchestrator_url)
    child_builders = cros_cls.CQOrchestratorOutput.fetch(
        cq_orchestrator_id
    ).child_builders
    if not child_builders:
        raise ValueError(f"No child builders found for {cq_orchestrator_url}")

    logging.info(
        "%d child builders found; finding associated tarball links",
        len(child_builders),
    )
    werror_links = fetch_werror_tarball_links(child_builders)
    if not werror_links:
        raise ValueError(
            f"No -Werror logs found in children of {cq_orchestrator_url}"
        )

    logging.info("%d -Werror logs found", len(werror_links))
    with tempfile.TemporaryDirectory("werror_logs_fetch_cq") as t:
        tempdir = Path(t)
        unpack_dir = tempdir / "unpacked"
        download_and_unpack_werror_tarballs(
            unpack_dir=unpack_dir,
            download_dir=tempdir / "tarballs",
            gs_urls=werror_links,
        )

        if output_directory.exists():
            logging.info("Removing output directory at %s", output_directory)
            shutil.rmtree(output_directory)

        output_directory.parent.mkdir(parents=True, exist_ok=True)
        # (Convert these to strs to keep mypy happy.)
        shutil.move(str(unpack_dir), str(output_directory))
    logging.info(
        "CQ logs from %s stored in %s",
        cq_orchestrator_url,
        output_directory,
    )


def main(argv: List[str]) -> None:
    """Parses `argv` and dispatches to the selected subcommand."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--debug", action="store_true", help="Enable debug logging"
    )
    subparsers = parser.add_subparsers(required=True)
    # b/318833638: While there's only one subparser here for the moment, more
    # are expected to come (specifically, one to download logs from a CQ run).
    aggregate = subparsers.add_parser(
        "aggregate",
        help="""
        Aggregate all -Werror reports beneath a directory. Note that this will
        traverse all children of the directory, so can be used either on
        unpacked -Werror reports from CQ builders, or can be used on e.g.,
        /build/cherry/var/lib/chromeos.
        """,
    )
    aggregate.set_defaults(func=aggregate_reports)
    aggregate.add_argument(
        "--canonicalize-board-roots",
        action="store_true",
        help="""
        Converts warnings paths starting with a board root (e.g., /build/atlas)
        to a form consistent across many boards.
        """,
    )
    aggregate.add_argument(
        "--directory", type=Path, required=True, help="Directory to inspect."
    )

    fetch_cq = subparsers.add_parser(
        "fetch-cq",
        help="Fetch all -Werror reports for a CQ run.",
    )
    fetch_cq.set_defaults(func=fetch_cq_reports)
    cl_or_cq_orchestrator = fetch_cq.add_mutually_exclusive_group(required=True)
    cl_or_cq_orchestrator.add_argument(
        "--cl",
        type=cros_cls.ChangeListURL.parse_with_patch_set,
        help="Link to a CL to get the most recent cq-orchestrator from",
    )
    cl_or_cq_orchestrator.add_argument(
        "--cq-orchestrator-id",
        type=cros_cls.BuildID,
        help="""
        Build number for a cq-orchestrator run. Builders invoked by this are
        examined for -Werror logs.
        """,
    )
    fetch_cq.add_argument(
        "--directory",
        type=Path,
        help=f"""
        Directory to put downloaded -Werror logs in. Default is a subdirectory
        of {_DEFAULT_FETCH_DIRECTORY}.
        """,
    )
    fetch_cq.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Remove the directory at `--directory` if it exists",
    )

    opts = parser.parse_args(argv)

    logging.basicConfig(
        format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
        "%(message)s",
        level=logging.DEBUG if opts.debug else logging.INFO,
    )

    # Each subparser installs `func` via set_defaults; a missing `func` means
    # the subparser wiring above is broken.
    assert getattr(opts, "func", None), "Unknown subcommand?"
    opts.func(opts)


if __name__ == "__main__":
    main(sys.argv[1:])