aboutsummaryrefslogtreecommitdiff
path: root/llvm_tools/fetch_cq_size_diff.py
blob: a20f7396aae052b15c28fef51f3716f8a8edae5c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/env python3
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Fetches the size diff between two images on gs://.

If given a CL, this will autodetect a passing CQ builder on that CL and find
a corresponding release build for said CQ builder. The sizes of these images
will be compared.

**Please note** that there's often version skew between release builds and CQ
builds. While this skew shouldn't result in _huge_ binary size differences,
it can still account for a few MB of diff in an average case.
"""

import abc
import argparse
import dataclasses
import json
import logging
import os
from pathlib import Path
import subprocess
import sys
import tempfile
from typing import List, Optional, Tuple

import cros_cls


@dataclasses.dataclass(frozen=True)
class SizeDiffInfo:
    """Immutable pair of sizes produced by a single comparison.

    This is a plain record: it performs no validation and no arithmetic on
    the values it carries.
    """

    # Size measured for the baseline artifact, in bytes.
    baseline_size_bytes: int
    # Size measured for the artifact being compared to the baseline, in bytes.
    new_size_bytes: int


class ComparableArtifact(abc.ABC):
    """Interface for one kind of CQ artifact whose size can be compared.

    Subclasses name the artifact file to look for on gs:// and implement how
    a downloaded copy of it is turned into a single size figure.
    """

    @property
    @abc.abstractmethod
    def artifact_name(self) -> str:
        """The artifact's file name on gs://, for example "image.zip"."""

    @abc.abstractmethod
    def _measure_artifact_size(self, file: Path) -> int:
        """Computes the size figure for the artifact stored at `file`.

        Implementations are free to create or change other entries in the
        directory that contains `file`, so callers must not rely on that
        directory's contents afterwards. `file` itself should be left
        unmodified.
        """

    def _download_and_measure_size(self, gs_url: str) -> int:
        """Copies `gs_url` into a scratch directory and measures the copy.

        The scratch directory is cleaned up when this returns.

        Raises:
            subprocess.CalledProcessError: if `gsutil cp` exits non-zero.
        """
        scratch = tempfile.TemporaryDirectory(prefix="fetch_size_diff_")
        with scratch as scratch_path:
            # Keep the remote file name; measurers may depend on it.
            download_target = Path(scratch_path) / os.path.basename(gs_url)
            copy_cmd = ["gsutil", "cp", gs_url, download_target]
            subprocess.run(copy_cmd, check=True, stdin=subprocess.DEVNULL)
            return self._measure_artifact_size(download_target)

    def compare_size_from_gs(self, baseline: str, new: str) -> SizeDiffInfo:
        """Downloads and measures both gs:// URLs, baseline first."""
        baseline_size = self._download_and_measure_size(baseline)
        new_size = self._download_and_measure_size(new)
        return SizeDiffInfo(
            baseline_size_bytes=baseline_size,
            new_size_bytes=new_size,
        )


class DebugInfoArtifact(ComparableArtifact):
    """Measures Chrome's debuginfo file inside a `debug.tgz` artifact."""

    @property
    def artifact_name(self) -> str:
        return "debug.tgz"

    def _measure_artifact_size(self, file: Path) -> int:
        """Extracts chrome.debug from `file` and returns its on-disk size.

        Raises:
            subprocess.CalledProcessError: if `tar` exits non-zero.
        """
        member = "./opt/google/chrome/chrome.debug"
        workdir = file.parent
        logging.info("Unpacking debuginfo...")
        # Only this one member is requested from the archive; it is written
        # beneath `workdir` because tar runs with that as its cwd.
        extract_cmd = ["tar", "xaf", file, member]
        subprocess.run(
            extract_cmd,
            stdin=subprocess.DEVNULL,
            cwd=workdir,
            check=True,
        )
        return (workdir / member).stat().st_size


class ImageSizeArtifact(ComparableArtifact):
    """ComparableArtifact instance for image files."""

    @property
    def artifact_name(self) -> str:
        return "image.zip"

    def _measure_artifact_size(self, file: Path) -> int:
        """Returns `total_size` from the package-sizes JSON inside `file`.

        The JSON file is extracted from the zip into the directory that
        contains `file`.

        Raises:
            subprocess.CalledProcessError: if `unzip` exits non-zero.
            ValueError: if the JSON is not an object, has no `total_size`
                key, or `total_size` is not an integer.
        """
        binpkg_sizes_name = "chromiumos_base_image.bin-package-sizes.json"
        subprocess.run(
            [
                "unzip",
                file.name,
                binpkg_sizes_name,
            ],
            check=True,
            cwd=file.parent,
            stdin=subprocess.DEVNULL,
        )
        # Only hold the file open while parsing; validation happens after.
        with (file.parent / binpkg_sizes_name).open(encoding="utf-8") as f:
            loaded = json.load(f)

        # Anything but a JSON object would otherwise surface as a confusing
        # TypeError from the subscript below.
        if not isinstance(loaded, dict):
            raise ValueError(
                f"Expected a JSON object in {binpkg_sizes_name}, "
                f"got {type(loaded)}"
            )

        try:
            size = loaded["total_size"]
        except KeyError as e:
            raise ValueError(f"Missing total_size in {loaded.keys()}") from e

        # bool is a subclass of int, so JSON `true`/`false` must be rejected
        # explicitly.
        if isinstance(size, bool) or not isinstance(size, int):
            raise ValueError(
                f"total_size was unexpectedly {type(size)}: {size}"
            )
        return size


def is_probably_non_production_builder(builder_name: str) -> bool:
    """Heuristically reports whether a builder is not a production board.

    A builder is flagged when its name contains any of the dash-delimited
    markers listed below. This is a substring heuristic, so treat the answer
    as mostly, not always, accurate.
    """
    non_production_markers = (
        "-asan-",
        "-buildtest-",
        "-fuzzer-",
        "-kernelnext-",
        "-ubsan-",
        "-vmtest-",
    )
    for marker in non_production_markers:
        if marker in builder_name:
            return True
    return False


def guess_release_artifact_path(artifact_link: str) -> Optional[str]:
    """Guesses a close-enough release path for a CQ artifact.

    Args:
        artifact_link: gs:// URL of an artifact file. Its parent directory is
            treated as the builder's artifacts directory, and the directory
            above that is expected to be named after the builder.

    Returns:
        A path to the release artifact. Returns None if `artifact_link`
        wasn't generated by a CQ builder, i.e. the builder directory's name
        does not end in "-cq".

    >>> guess_release_artifact_path(
    ...     "gs://chromeos-image-archive/brya-cq/"
    ...     "R121-15677.0.0-90523-8764532770258575633/image.zip"
    ... )
    'gs://chromeos-image-archive/brya-release/R121-15677.0.0/image.zip'
    """
    artifacts_link, artifact_file = os.path.split(artifact_link)
    # Scrape the board name from a level above the artifacts directory.
    builder = os.path.basename(os.path.dirname(artifacts_link))
    cq_suffix = "-cq"
    # Decide whether this is a CQ artifact before parsing the release
    # version, so links from other builders yield None without ever reaching
    # the release parser.
    if not builder.endswith(cq_suffix):
        return None
    board = builder[: -len(cq_suffix)]
    release_version = cros_cls.parse_release_from_builder_artifacts_link(
        artifacts_link
    )
    return (
        f"gs://chromeos-image-archive/{board}-release/{release_version}/"
        f"{artifact_file}"
    )


def try_gsutil_ls(paths: List[str]) -> List[str]:
    """Returns all of the paths `gsutil` matches from `paths`.

    Ignores errors from gsutil about paths not existing.

    Args:
        paths: gs:// URLs to list. If empty, gsutil is not invoked and an
            empty list is returned.

    Returns:
        The non-blank lines of gsutil's stdout, stripped of surrounding
        whitespace.

    Raises:
        subprocess.CalledProcessError: if gsutil fails for any reason other
            than URLs matching no objects.
    """
    # `gsutil ls` with no URLs lists the project's buckets rather than
    # nothing, and those must not be reported as matches.
    if not paths:
        return []

    result = subprocess.run(
        ["gsutil", "-m", "ls"] + paths,
        # If any URI doesn't exist, gsutil will fail. Ignore the failure.
        check=False,
        encoding="utf-8",
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if result.returncode:
        # Ensure the error message is what's expected, rather than e.g.,
        # invalid credentials.
        err_msg = "CommandException: One or more URLs matched no objects"
        if err_msg not in result.stderr:
            logging.error(
                "gsutil had unexpected output; stderr: %r", result.stderr
            )
            result.check_returncode()

    # Drop blank lines so callers never see "" as a matched path.
    stripped_lines = (x.strip() for x in result.stdout.splitlines())
    return [x for x in stripped_lines if x]


def find_size_diffable_cq_artifacts(
    cq_build_ids: List[cros_cls.BuildID],
    artifact_name: str,
) -> Optional[Tuple[str, str]]:
    """Searches the cq-orchestrator builds for candidates for size comparison.

    Builds are inspected in the order given; the first one that yields a
    CQ artifact with an existing release counterpart wins.

    Args:
        cq_build_ids: IDs of the cq-orchestrator builds to inspect.
        artifact_name: file name of the artifact to look for in each child
            builder's artifacts directory.

    Returns:
        None if no candidates are found. Otherwise, returns a two-tuple: index
        0 is the baseline (release) artifact, index 1 is the corresponding
        artifact generated by the CQ.
    """
    for cq_build_id in cq_build_ids:
        logging.info("Inspecting CQ build %d...", cq_build_id)
        orch_output = cros_cls.CQOrchestratorOutput.fetch(cq_build_id)
        child_builder_values = cros_cls.CQBoardBuilderOutput.fetch_many(
            [
                val
                for name, val in orch_output.child_builders.items()
                if not is_probably_non_production_builder(name)
            ]
        )
        artifacts_links = [
            x.artifacts_link
            for x in child_builder_values
            if x.artifacts_link is not None
        ]
        if not artifacts_links:
            logging.info("No children of CQ run %d had artifacts", cq_build_id)
            continue

        potential_artifacts = try_gsutil_ls(
            [os.path.join(x, artifact_name) for x in artifacts_links]
        )
        if not potential_artifacts:
            logging.info(
                "No children of CQ run %d produced a(n) %s",
                cq_build_id,
                artifact_name,
            )
            continue

        logging.debug(
            "Found candidate %s files: %s", artifact_name, potential_artifacts
        )
        guessed_paths = [
            (x, guess_release_artifact_path(x)) for x in potential_artifacts
        ]
        logging.debug("Guessed corresponding artifact files: %s", guessed_paths)

        release_candidates = [x for _, x in guessed_paths if x]
        # Never run `gsutil ls` with zero URLs: that lists buckets instead of
        # reporting "nothing found".
        listed = try_gsutil_ls(release_candidates) if release_candidates else []
        # Keep only results that map back to a CQ artifact, so the lookup
        # below cannot come up empty.
        known_release_paths = set(release_candidates)
        release_artifacts = [x for x in listed if x in known_release_paths]
        if not release_artifacts:
            logging.info(
                "No release %s artifacts could be found for CQ builder %d.",
                artifact_name,
                cq_build_id,
            )
            continue

        # `try_gsutil_ls` makes no ordering guarantees; always pick the min()
        # artifact here for consistency across reruns.
        selected_release_artifact = min(release_artifacts)
        logging.info("Selected release artifact: %s", selected_release_artifact)
        # If several CQ artifacts guessed the same release path, the first
        # one listed is used.
        cq_artifact = next(
            cq_path
            for cq_path, guessed_path in guessed_paths
            if guessed_path == selected_release_artifact
        )
        return selected_release_artifact, cq_artifact
    return None


def inspect_gs_impl(
    baseline_gs_url: str, new_gs_url: str, artifact: ComparableArtifact
) -> None:
    """Compares the artifacts at the given URLs, logging the results.

    Args:
        baseline_gs_url: gs:// URL of the baseline artifact.
        new_gs_url: gs:// URL of the artifact to compare to the baseline.
        artifact: determines how each downloaded artifact is measured.
    """
    size_diff = artifact.compare_size_from_gs(baseline_gs_url, new_gs_url)
    # `%d` doesn't support `,` as a modifier, and commas make these numbers
    # much easier to read. Prefer to keep strings interpreted as format strings
    # constant.
    logging.info("Baseline size: %s", f"{size_diff.baseline_size_bytes:,}")
    logging.info("New size: %s", f"{size_diff.new_size_bytes:,}")

    if not size_diff.baseline_size_bytes:
        # A relative diff against zero is undefined; say so rather than
        # crashing with ZeroDivisionError after the sizes were logged.
        logging.warning("Baseline size is 0; cannot compute a percent diff.")
        return

    # Signed on purpose: a negative value means the new artifact is smaller.
    diff_pct = size_diff.new_size_bytes / size_diff.baseline_size_bytes - 1
    logging.info("Diff: %.2f%%", diff_pct * 100)


def inspect_cl(opts: argparse.Namespace, artifact: ComparableArtifact) -> None:
    """Runs the `cl` subcommand.

    Looks up the cq-orchestrator builds for `opts.cl`, picks a CQ artifact
    with a matching release artifact, and logs their size difference. Exits
    the process with an error message if no builds or no comparable
    artifacts are found.
    """
    orchestrator_ids = cros_cls.fetch_cq_orchestrator_ids(opts.cl)
    if not orchestrator_ids:
        sys.exit(f"No completed cq-orchestrators found for {opts.cl}")

    # NOTE(review): the IDs are searched in exactly the order
    # `fetch_cq_orchestrator_ids` returned them; nothing is reordered here.
    # Confirm that order is newest-first if that is the intent.
    artifact_pair = find_size_diffable_cq_artifacts(
        orchestrator_ids, artifact.artifact_name
    )
    if not artifact_pair:
        sys.exit("No diffable artifacts were found")

    release_url, cq_url = artifact_pair
    logging.info("Comparing %s (baseline) to %s (new)", release_url, cq_url)
    inspect_gs_impl(release_url, cq_url, artifact)

    reminder = (
        "Friendly reminder: CL inspection diffs between your CL and a "
        "corresponding release build. Size differences up to a few megabytes "
        "are expected and do not necessarily indicate a size difference "
        "attributable to your CL."
    )
    logging.warning(reminder)


def inspect_gs(opts: argparse.Namespace, artifact: ComparableArtifact) -> None:
    """Runs the `gs` subcommand: compares the two URLs given on the CLI."""
    baseline_url, new_url = opts.baseline, opts.new
    inspect_gs_impl(baseline_url, new_url, artifact)


def main(argv: List[str]) -> None:
    """Parses `argv` and runs the selected subcommand.

    Args:
        argv: command-line arguments, without the program name.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    what_to_compare = parser.add_mutually_exclusive_group(required=True)
    what_to_compare.add_argument(
        "--image", action="store_true", help="Compare image.zip sizes."
    )
    what_to_compare.add_argument(
        "--debuginfo", action="store_true", help="Compare debuginfo sizes."
    )

    parser.add_argument(
        "--debug", action="store_true", help="Enable debug logging"
    )
    # An explicit `dest` gives argparse a name to report when the subcommand
    # is omitted. Without one, older Python releases raise a TypeError while
    # formatting the "arguments are required" message instead of printing
    # a usage error.
    subparsers = parser.add_subparsers(dest="subcommand", required=True)

    cl_parser = subparsers.add_parser(
        "cl", help="Inspect a CL's CQ runs to find artifacts to compare."
    )
    cl_parser.set_defaults(func=inspect_cl)
    cl_parser.add_argument(
        "cl",
        type=cros_cls.ChangeListURL.parse_with_patch_set,
        help="CL to inspect CQ runs of. This must contain a patchset number.",
    )

    gs_parser = subparsers.add_parser(
        "gs", help="Directly compare two zip files from gs://."
    )
    gs_parser.add_argument("baseline", help="Baseline file to compare.")
    gs_parser.add_argument("new", help="New file to compare.")
    gs_parser.set_defaults(func=inspect_gs)
    opts = parser.parse_args(argv)

    logging.basicConfig(
        format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
        "%(message)s",
        level=logging.DEBUG if opts.debug else logging.INFO,
    )

    # Both asserts are internal sanity checks: argparse already enforces a
    # subcommand and exactly one of --image/--debuginfo.
    assert getattr(opts, "func", None), "Unknown subcommand?"
    if opts.image:
        artifact: ComparableArtifact = ImageSizeArtifact()
    else:
        assert opts.debuginfo
        artifact = DebugInfoArtifact()

    # Each subcommand handler takes the parsed options and the artifact kind.
    opts.func(opts, artifact)


# Run only when executed as a script; the program name (argv[0]) is dropped
# before the arguments are handed to main().
if __name__ == "__main__":
    main(sys.argv[1:])