aboutsummaryrefslogtreecommitdiff
path: root/sanitizers/src/main/java/com/code_intelligence/jazzer/sanitizers/RegexRoadblocks.java
diff options
context:
space:
mode:
Diffstat (limited to 'sanitizers/src/main/java/com/code_intelligence/jazzer/sanitizers/RegexRoadblocks.java')
-rw-r--r--sanitizers/src/main/java/com/code_intelligence/jazzer/sanitizers/RegexRoadblocks.java322
1 files changed, 322 insertions, 0 deletions
diff --git a/sanitizers/src/main/java/com/code_intelligence/jazzer/sanitizers/RegexRoadblocks.java b/sanitizers/src/main/java/com/code_intelligence/jazzer/sanitizers/RegexRoadblocks.java
new file mode 100644
index 00000000..1043ac02
--- /dev/null
+++ b/sanitizers/src/main/java/com/code_intelligence/jazzer/sanitizers/RegexRoadblocks.java
@@ -0,0 +1,322 @@
+// Copyright 2022 Code Intelligence GmbH
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.code_intelligence.jazzer.sanitizers;
+
+import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.INVALID_OFFSET;
+import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.field;
+import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.nestedClass;
+import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.offset;
+
+import com.code_intelligence.jazzer.api.HookType;
+import com.code_intelligence.jazzer.api.Jazzer;
+import com.code_intelligence.jazzer.api.MethodHook;
+import com.code_intelligence.jazzer.runtime.UnsafeProvider;
+import java.lang.invoke.MethodHandle;
+import java.util.WeakHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import sun.misc.Unsafe;
+
+/**
+ * The hooks in this class extend the reach of Jazzer's string compare instrumentation to literals
+ * (both strings and characters) that are part of regular expression patterns.
+ * <p>
+ * Internally, the Java standard library represents a compiled regular expression as a graph of
+ * instances of Pattern$Node instances, each of which represents a single unit of the full
+ * expression and provides a `match` function that takes a {@link Matcher}, a {@link CharSequence}
+ * to match against and an index into the sequence. With a hook on this method for every subclass of
+ * Pattern$Node, the contents of the node can be inspected and an appropriate string comparison
+ * between the relevant part of the input string and the literal string can be reported.
+ */
+public final class RegexRoadblocks {
+ // The number of characters preceding one that failed a character predicate to include in the
+ // reported string comparison.
+ private static final int CHARACTER_COMPARE_CONTEXT_LENGTH = 10;
+
+ private static final Unsafe UNSAFE = UnsafeProvider.getUnsafe();
+ private static final Class<?> SLICE_NODE = nestedClass(Pattern.class, "SliceNode");
+ private static final long SLICE_NODE_BUFFER_OFFSET =
+ offset(UNSAFE, field(SLICE_NODE, "buffer", int[].class));
+ private static final Class<?> CHAR_PREDICATE = nestedClass(Pattern.class, "CharPredicate");
+ private static final Class<?> CHAR_PROPERTY = nestedClass(Pattern.class, "CharProperty");
+ private static final long CHAR_PROPERTY_PREDICATE_OFFSET = offset(
+ UNSAFE, field(CHAR_PROPERTY, "predicate", nestedClass(Pattern.class, "CharPredicate")));
+ private static final Class<?> BIT_CLASS = nestedClass(Pattern.class, "BitClass");
+ private static final long BIT_CLASS_BITS_OFFSET =
+ offset(UNSAFE, field(BIT_CLASS, "bits", boolean[].class));
+
+ // Weakly map CharPredicate instances to characters that satisfy the predicate. Since
+ // CharPredicate instances are usually lambdas, we collect their solutions by hooking the
+ // functions constructing them rather than extracting the solutions via reflection.
+ // Note: Java 8 uses anonymous subclasses of CharProperty instead of lambdas implementing
+ // CharPredicate, hence CharProperty instances are used as keys instead in that case.
+ private static final ThreadLocal<WeakHashMap<Object, Character>> PREDICATE_SOLUTIONS =
+ ThreadLocal.withInitial(WeakHashMap::new);
+
+ // Do not act on instrumented regexes used by Jazzer internally, e.g. by ClassGraph.
+ private static boolean HOOK_DISABLED = true;
+
+ static {
+ Jazzer.onFuzzTargetReady(() -> HOOK_DISABLED = UNSAFE == null);
+ }
+
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$Node",
+ targetMethod = "match",
+ targetMethodDescriptor = "(Ljava/util/regex/Matcher;ILjava/lang/CharSequence;)Z",
+ additionalClassesToHook =
+ {
+ "java.util.regex.Matcher",
+ "java.util.regex.Pattern$BackRef",
+ "java.util.regex.Pattern$Behind",
+ "java.util.regex.Pattern$BehindS",
+ "java.util.regex.Pattern$BmpCharProperty",
+ "java.util.regex.Pattern$BmpCharPropertyGreedy",
+ "java.util.regex.Pattern$BnM",
+ "java.util.regex.Pattern$BnMS",
+ "java.util.regex.Pattern$Bound",
+ "java.util.regex.Pattern$Branch",
+ "java.util.regex.Pattern$BranchConn",
+ "java.util.regex.Pattern$CharProperty",
+ "java.util.regex.Pattern$CharPropertyGreedy",
+ "java.util.regex.Pattern$CIBackRef",
+ "java.util.regex.Pattern$Caret",
+ "java.util.regex.Pattern$Curly",
+ "java.util.regex.Pattern$Conditional",
+ "java.util.regex.Pattern$First",
+ "java.util.regex.Pattern$GraphemeBound",
+ "java.util.regex.Pattern$GroupCurly",
+ "java.util.regex.Pattern$GroupHead",
+ "java.util.regex.Pattern$GroupRef",
+ "java.util.regex.Pattern$LastMatch",
+ "java.util.regex.Pattern$LazyLoop",
+ "java.util.regex.Pattern$LineEnding",
+ "java.util.regex.Pattern$Loop",
+ "java.util.regex.Pattern$Neg",
+ "java.util.regex.Pattern$NFCCharProperty",
+ "java.util.regex.Pattern$NotBehind",
+ "java.util.regex.Pattern$NotBehindS",
+ "java.util.regex.Pattern$Pos",
+ "java.util.regex.Pattern$Ques",
+ "java.util.regex.Pattern$Slice",
+ "java.util.regex.Pattern$SliceI",
+ "java.util.regex.Pattern$SliceIS",
+ "java.util.regex.Pattern$SliceS",
+ "java.util.regex.Pattern$SliceU",
+ "java.util.regex.Pattern$Start",
+ "java.util.regex.Pattern$StartS",
+ "java.util.regex.Pattern$UnixCaret",
+ "java.util.regex.Pattern$UnixDollar",
+ "java.util.regex.Pattern$XGrapheme",
+ })
+ public static void
+ nodeMatchHook(MethodHandle method, Object node, Object[] args, int hookId, Boolean matched) {
+ if (HOOK_DISABLED || matched || node == null)
+ return;
+ Matcher matcher = (Matcher) args[0];
+ if (matcher == null)
+ return;
+ int i = (int) args[1];
+ CharSequence seq = (CharSequence) args[2];
+ if (seq == null)
+ return;
+
+ if (SLICE_NODE != null && SLICE_NODE.isInstance(node)) {
+ // The node encodes a match against a fixed string literal. Extract the literal and report a
+ // comparison between it and the subsequence of seq starting at i.
+ if (SLICE_NODE_BUFFER_OFFSET == INVALID_OFFSET)
+ return;
+ int currentLength = limitedLength(matcher.regionEnd() - i);
+ String current = seq.subSequence(i, i + currentLength).toString();
+
+ // All the subclasses of SliceNode store the literal in an int[], which we have to truncate to
+ // a char[].
+ int[] buffer = (int[]) UNSAFE.getObject(node, SLICE_NODE_BUFFER_OFFSET);
+ char[] charBuffer = new char[limitedLength(buffer.length)];
+ for (int j = 0; j < charBuffer.length; j++) {
+ charBuffer[j] = (char) buffer[j];
+ }
+ String target = new String(charBuffer);
+
+ Jazzer.guideTowardsEquality(current, target, perRegexId(hookId, matcher));
+ } else if (CHAR_PROPERTY != null && CHAR_PROPERTY.isInstance(node)) {
+ // The node encodes a match against a class of characters, which may be hard to guess unicode
+ // characters. We rely on further hooks to track the relation between these nodes and
+ // characters satisfying their match function since the nodes themselves encode this
+ // information in lambdas, which are difficult to dissect via reflection. If we know a
+ // matching character, report a one-character (plus context) string comparison.
+ Object solutionKey;
+ if (CHAR_PROPERTY_PREDICATE_OFFSET == INVALID_OFFSET) {
+ if (CHAR_PREDICATE == null) {
+ // We are likely running against JDK 8, which directly construct subclasses of
+ // CharProperty rather than using lambdas implementing CharPredicate.
+ solutionKey = node;
+ } else {
+ return;
+ }
+ } else {
+ solutionKey = UNSAFE.getObject(node, CHAR_PROPERTY_PREDICATE_OFFSET);
+ }
+ if (solutionKey == null)
+ return;
+ Character solution = predicateSolution(solutionKey);
+ if (solution == null)
+ return;
+ // We report a string comparison rather than an integer comparison for two reasons:
+ // 1. If the characters are four byte codepoints, they will be coded on six bytes (a surrogate
+ // pair) in CESU-8, which is the encoding assumed for the fuzzer input, whereas ASCII
+ // characters will be coded on a single byte. By using the string compare hook, we do not
+ // have to worry about the encoding at this point.
+ // 2. The same character can appear multiple times in both the pattern and the matched string,
+ // which makes it harder for the fuzzer to determine the correct position to mutate the
+ // current character into the matching character. By providing a short section of the
+ // input string preceding the incorrect character, we increase the chance of a hit.
+ String context =
+ seq.subSequence(Math.max(0, i - CHARACTER_COMPARE_CONTEXT_LENGTH), i).toString();
+ String current = seq.subSequence(i, Math.min(i + 1, matcher.regionEnd())).toString();
+ String target = Character.toString(solution);
+ Jazzer.guideTowardsEquality(context + current, context + target, perRegexId(hookId, matcher));
+ }
+ }
+
+ // This and all following hooks track the relation between a CharPredicate or CharProperty
+ // instance and a character that matches it. We use an after hook on the factory methods so that
+ // we have access to the parameters and the created instance at the same time.
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "Single",
+ targetMethodDescriptor = "(I)Ljava/util/regex/Pattern$BmpCharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "SingleI",
+ targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "SingleS",
+ targetMethodDescriptor = "(I)Ljava/util/regex/Pattern$CharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "SingleU",
+ targetMethodDescriptor = "(I)Ljava/util/regex/Pattern$CharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ public static void
+ singleHook(MethodHandle method, Object node, Object[] args, int hookId, Object predicate) {
+ if (HOOK_DISABLED || predicate == null)
+ return;
+ PREDICATE_SOLUTIONS.get().put(predicate, (char) (int) args[0]);
+ }
+
+ // Java 8 uses classes extending CharProperty instead of lambdas implementing CharPredicate to
+ // match single characters, so also hook those.
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$Single",
+ targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$SingleI",
+ targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$SingleS",
+ targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$SingleU",
+ targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
+ public static void
+ java8SingleHook(
+ MethodHandle method, Object property, Object[] args, int hookId, Object alwaysNull) {
+ if (HOOK_DISABLED || property == null)
+ return;
+ PREDICATE_SOLUTIONS.get().put(property, (char) (int) args[0]);
+ }
+
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "Range",
+ targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "CIRange",
+ targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "CIRangeU",
+ targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ // Java 8 uses anonymous classes extending CharProperty instead of lambdas implementing
+ // CharPredicate to match single characters, so also hook those.
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "rangeFor",
+ targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharProperty;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "caseInsensitiveRangeFor",
+ targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharProperty;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ public static void
+ rangeHook(MethodHandle method, Object node, Object[] args, int hookId, Object predicate) {
+ if (HOOK_DISABLED || predicate == null)
+ return;
+ PREDICATE_SOLUTIONS.get().put(predicate, (char) (int) args[0]);
+ }
+
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$CharPredicate",
+ targetMethod = "union",
+ targetMethodDescriptor =
+ "(Ljava/util/regex/Pattern$CharPredicate;)Ljava/util/regex/Pattern$CharPredicate;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ // Java 8 uses anonymous classes extending CharProperty instead of lambdas implementing
+ // CharPredicate to match single characters, so also hook union for those. Even though the classes
+ // of the parameters will be different, the actual implementation of the hook is the same in this
+ // case.
+ @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
+ targetMethod = "union",
+ targetMethodDescriptor =
+ "(Ljava/util/regex/Pattern$CharProperty;Ljava/util/regex/Pattern$CharProperty;)Ljava/util/regex/Pattern$CharProperty;",
+ additionalClassesToHook = {"java.util.regex.Pattern"})
+ public static void
+ unionHook(
+ MethodHandle method, Object thisObject, Object[] args, int hookId, Object unionPredicate) {
+ if (HOOK_DISABLED || unionPredicate == null)
+ return;
+ Character solution = predicateSolution(thisObject);
+ if (solution == null)
+ solution = predicateSolution(args[0]);
+ if (solution == null)
+ return;
+ PREDICATE_SOLUTIONS.get().put(unionPredicate, solution);
+ }
+
+ private static Character predicateSolution(Object charPredicate) {
+ return PREDICATE_SOLUTIONS.get().computeIfAbsent(charPredicate, unused -> {
+ if (BIT_CLASS != null && BIT_CLASS.isInstance(charPredicate)) {
+ // BitClass instances have an empty bits array at construction time, so we scan their
+ // constants lazily when needed.
+ boolean[] bits = (boolean[]) UNSAFE.getObject(charPredicate, BIT_CLASS_BITS_OFFSET);
+ for (int i = 0; i < bits.length; i++) {
+ if (bits[i]) {
+ PREDICATE_SOLUTIONS.get().put(charPredicate, (char) i);
+ return (char) i;
+ }
+ }
+ }
+ return null;
+ });
+ }
+
+ // Limits a length to the maximum length libFuzzer will read up to in a callback.
+ private static int limitedLength(int length) {
+ return Math.min(length, 64);
+ }
+
+ // hookId only takes one distinct value per Node subclass. In order to get different regex matches
+ // to be tracked similar to different instances of string compares, we mix in the hash of the
+ // underlying pattern. We expect patterns to be static almost always, so that this should not fill
+ // up the value profile map too quickly.
+ private static int perRegexId(int hookId, Matcher matcher) {
+ return hookId ^ matcher.pattern().toString().hashCode();
+ }
+}