aboutsummaryrefslogtreecommitdiff
path: root/sanitizers/src/main/java/com/code_intelligence/jazzer/sanitizers/RegexRoadblocks.java
blob: 1043ac02a884ca5053b916b6116e2bfacee51ce4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
// Copyright 2022 Code Intelligence GmbH
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.code_intelligence.jazzer.sanitizers;

import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.INVALID_OFFSET;
import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.field;
import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.nestedClass;
import static com.code_intelligence.jazzer.sanitizers.utils.ReflectionUtils.offset;

import com.code_intelligence.jazzer.api.HookType;
import com.code_intelligence.jazzer.api.Jazzer;
import com.code_intelligence.jazzer.api.MethodHook;
import com.code_intelligence.jazzer.runtime.UnsafeProvider;
import java.lang.invoke.MethodHandle;
import java.util.WeakHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import sun.misc.Unsafe;

/**
 * The hooks in this class extend the reach of Jazzer's string compare instrumentation to literals
 * (both strings and characters) that are part of regular expression patterns.
 * <p>
 * Internally, the Java standard library represents a compiled regular expression as a graph of
 * instances of Pattern$Node instances, each of which represents a single unit of the full
 * expression and provides a `match` function that takes a {@link Matcher}, a {@link CharSequence}
 * to match against and an index into the sequence. With a hook on this method for every subclass of
 * Pattern$Node, the contents of the node can be inspected and an appropriate string comparison
 * between the relevant part of the input string and the literal string can be reported.
 */
public final class RegexRoadblocks {
  // The number of characters preceding one that failed a character predicate to include in the
  // reported string comparison.
  private static final int CHARACTER_COMPARE_CONTEXT_LENGTH = 10;

  private static final Unsafe UNSAFE = UnsafeProvider.getUnsafe();
  private static final Class<?> SLICE_NODE = nestedClass(Pattern.class, "SliceNode");
  private static final long SLICE_NODE_BUFFER_OFFSET =
      offset(UNSAFE, field(SLICE_NODE, "buffer", int[].class));
  private static final Class<?> CHAR_PREDICATE = nestedClass(Pattern.class, "CharPredicate");
  private static final Class<?> CHAR_PROPERTY = nestedClass(Pattern.class, "CharProperty");
  private static final long CHAR_PROPERTY_PREDICATE_OFFSET = offset(
      UNSAFE, field(CHAR_PROPERTY, "predicate", nestedClass(Pattern.class, "CharPredicate")));
  private static final Class<?> BIT_CLASS = nestedClass(Pattern.class, "BitClass");
  private static final long BIT_CLASS_BITS_OFFSET =
      offset(UNSAFE, field(BIT_CLASS, "bits", boolean[].class));

  // Weakly map CharPredicate instances to characters that satisfy the predicate. Since
  // CharPredicate instances are usually lambdas, we collect their solutions by hooking the
  // functions constructing them rather than extracting the solutions via reflection.
  // Note: Java 8 uses anonymous subclasses of CharProperty instead of lambdas implementing
  // CharPredicate, hence CharProperty instances are used as keys instead in that case.
  private static final ThreadLocal<WeakHashMap<Object, Character>> PREDICATE_SOLUTIONS =
      ThreadLocal.withInitial(WeakHashMap::new);

  // Do not act on instrumented regexes used by Jazzer internally, e.g. by ClassGraph.
  private static boolean HOOK_DISABLED = true;

  static {
    Jazzer.onFuzzTargetReady(() -> HOOK_DISABLED = UNSAFE == null);
  }

  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$Node",
      targetMethod = "match",
      targetMethodDescriptor = "(Ljava/util/regex/Matcher;ILjava/lang/CharSequence;)Z",
      additionalClassesToHook =
          {
              "java.util.regex.Matcher",
              "java.util.regex.Pattern$BackRef",
              "java.util.regex.Pattern$Behind",
              "java.util.regex.Pattern$BehindS",
              "java.util.regex.Pattern$BmpCharProperty",
              "java.util.regex.Pattern$BmpCharPropertyGreedy",
              "java.util.regex.Pattern$BnM",
              "java.util.regex.Pattern$BnMS",
              "java.util.regex.Pattern$Bound",
              "java.util.regex.Pattern$Branch",
              "java.util.regex.Pattern$BranchConn",
              "java.util.regex.Pattern$CharProperty",
              "java.util.regex.Pattern$CharPropertyGreedy",
              "java.util.regex.Pattern$CIBackRef",
              "java.util.regex.Pattern$Caret",
              "java.util.regex.Pattern$Curly",
              "java.util.regex.Pattern$Conditional",
              "java.util.regex.Pattern$First",
              "java.util.regex.Pattern$GraphemeBound",
              "java.util.regex.Pattern$GroupCurly",
              "java.util.regex.Pattern$GroupHead",
              "java.util.regex.Pattern$GroupRef",
              "java.util.regex.Pattern$LastMatch",
              "java.util.regex.Pattern$LazyLoop",
              "java.util.regex.Pattern$LineEnding",
              "java.util.regex.Pattern$Loop",
              "java.util.regex.Pattern$Neg",
              "java.util.regex.Pattern$NFCCharProperty",
              "java.util.regex.Pattern$NotBehind",
              "java.util.regex.Pattern$NotBehindS",
              "java.util.regex.Pattern$Pos",
              "java.util.regex.Pattern$Ques",
              "java.util.regex.Pattern$Slice",
              "java.util.regex.Pattern$SliceI",
              "java.util.regex.Pattern$SliceIS",
              "java.util.regex.Pattern$SliceS",
              "java.util.regex.Pattern$SliceU",
              "java.util.regex.Pattern$Start",
              "java.util.regex.Pattern$StartS",
              "java.util.regex.Pattern$UnixCaret",
              "java.util.regex.Pattern$UnixDollar",
              "java.util.regex.Pattern$XGrapheme",
          })
  public static void
  nodeMatchHook(MethodHandle method, Object node, Object[] args, int hookId, Boolean matched) {
    if (HOOK_DISABLED || matched || node == null)
      return;
    Matcher matcher = (Matcher) args[0];
    if (matcher == null)
      return;
    int i = (int) args[1];
    CharSequence seq = (CharSequence) args[2];
    if (seq == null)
      return;

    if (SLICE_NODE != null && SLICE_NODE.isInstance(node)) {
      // The node encodes a match against a fixed string literal. Extract the literal and report a
      // comparison between it and the subsequence of seq starting at i.
      if (SLICE_NODE_BUFFER_OFFSET == INVALID_OFFSET)
        return;
      int currentLength = limitedLength(matcher.regionEnd() - i);
      String current = seq.subSequence(i, i + currentLength).toString();

      // All the subclasses of SliceNode store the literal in an int[], which we have to truncate to
      // a char[].
      int[] buffer = (int[]) UNSAFE.getObject(node, SLICE_NODE_BUFFER_OFFSET);
      char[] charBuffer = new char[limitedLength(buffer.length)];
      for (int j = 0; j < charBuffer.length; j++) {
        charBuffer[j] = (char) buffer[j];
      }
      String target = new String(charBuffer);

      Jazzer.guideTowardsEquality(current, target, perRegexId(hookId, matcher));
    } else if (CHAR_PROPERTY != null && CHAR_PROPERTY.isInstance(node)) {
      // The node encodes a match against a class of characters, which may be hard to guess unicode
      // characters. We rely on further hooks to track the relation between these nodes and
      // characters satisfying their match function since the nodes themselves encode this
      // information in lambdas, which are difficult to dissect via reflection. If we know a
      // matching character, report a one-character (plus context) string comparison.
      Object solutionKey;
      if (CHAR_PROPERTY_PREDICATE_OFFSET == INVALID_OFFSET) {
        if (CHAR_PREDICATE == null) {
          // We are likely running against JDK 8, which directly construct subclasses of
          // CharProperty rather than using lambdas implementing CharPredicate.
          solutionKey = node;
        } else {
          return;
        }
      } else {
        solutionKey = UNSAFE.getObject(node, CHAR_PROPERTY_PREDICATE_OFFSET);
      }
      if (solutionKey == null)
        return;
      Character solution = predicateSolution(solutionKey);
      if (solution == null)
        return;
      // We report a string comparison rather than an integer comparison for two reasons:
      // 1. If the characters are four byte codepoints, they will be coded on six bytes (a surrogate
      //    pair) in CESU-8, which is the encoding assumed for the fuzzer input, whereas ASCII
      //    characters will be coded on a single byte. By using the string compare hook, we do not
      //    have to worry about the encoding at this point.
      // 2. The same character can appear multiple times in both the pattern and the matched string,
      //    which makes it harder for the fuzzer to determine the correct position to mutate the
      //    current character into the matching character. By providing a short section of the
      //    input string preceding the incorrect character, we increase the chance of a hit.
      String context =
          seq.subSequence(Math.max(0, i - CHARACTER_COMPARE_CONTEXT_LENGTH), i).toString();
      String current = seq.subSequence(i, Math.min(i + 1, matcher.regionEnd())).toString();
      String target = Character.toString(solution);
      Jazzer.guideTowardsEquality(context + current, context + target, perRegexId(hookId, matcher));
    }
  }

  // This and all following hooks track the relation between a CharPredicate or CharProperty
  // instance and a character that matches it. We use an after hook on the factory methods so that
  // we have access to the parameters and the created instance at the same time.
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "Single",
      targetMethodDescriptor = "(I)Ljava/util/regex/Pattern$BmpCharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "SingleI",
      targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "SingleS",
      targetMethodDescriptor = "(I)Ljava/util/regex/Pattern$CharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "SingleU",
      targetMethodDescriptor = "(I)Ljava/util/regex/Pattern$CharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  public static void
  singleHook(MethodHandle method, Object node, Object[] args, int hookId, Object predicate) {
    if (HOOK_DISABLED || predicate == null)
      return;
    PREDICATE_SOLUTIONS.get().put(predicate, (char) (int) args[0]);
  }

  // Java 8 uses classes extending CharProperty instead of lambdas implementing CharPredicate to
  // match single characters, so also hook those.
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$Single",
      targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$SingleI",
      targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$SingleS",
      targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$SingleU",
      targetMethod = "<init>", additionalClassesToHook = {"java.util.regex.Pattern"})
  public static void
  java8SingleHook(
      MethodHandle method, Object property, Object[] args, int hookId, Object alwaysNull) {
    if (HOOK_DISABLED || property == null)
      return;
    PREDICATE_SOLUTIONS.get().put(property, (char) (int) args[0]);
  }

  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "Range",
      targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "CIRange",
      targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "CIRangeU",
      targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  // Java 8 uses anonymous classes extending CharProperty instead of lambdas implementing
  // CharPredicate to match single characters, so also hook those.
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "rangeFor",
      targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharProperty;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "caseInsensitiveRangeFor",
      targetMethodDescriptor = "(II)Ljava/util/regex/Pattern$CharProperty;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  public static void
  rangeHook(MethodHandle method, Object node, Object[] args, int hookId, Object predicate) {
    if (HOOK_DISABLED || predicate == null)
      return;
    PREDICATE_SOLUTIONS.get().put(predicate, (char) (int) args[0]);
  }

  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern$CharPredicate",
      targetMethod = "union",
      targetMethodDescriptor =
          "(Ljava/util/regex/Pattern$CharPredicate;)Ljava/util/regex/Pattern$CharPredicate;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  // Java 8 uses anonymous classes extending CharProperty instead of lambdas implementing
  // CharPredicate to match single characters, so also hook union for those. Even though the classes
  // of the parameters will be different, the actual implementation of the hook is the same in this
  // case.
  @MethodHook(type = HookType.AFTER, targetClassName = "java.util.regex.Pattern",
      targetMethod = "union",
      targetMethodDescriptor =
          "(Ljava/util/regex/Pattern$CharProperty;Ljava/util/regex/Pattern$CharProperty;)Ljava/util/regex/Pattern$CharProperty;",
      additionalClassesToHook = {"java.util.regex.Pattern"})
  public static void
  unionHook(
      MethodHandle method, Object thisObject, Object[] args, int hookId, Object unionPredicate) {
    if (HOOK_DISABLED || unionPredicate == null)
      return;
    Character solution = predicateSolution(thisObject);
    if (solution == null)
      solution = predicateSolution(args[0]);
    if (solution == null)
      return;
    PREDICATE_SOLUTIONS.get().put(unionPredicate, solution);
  }

  private static Character predicateSolution(Object charPredicate) {
    return PREDICATE_SOLUTIONS.get().computeIfAbsent(charPredicate, unused -> {
      if (BIT_CLASS != null && BIT_CLASS.isInstance(charPredicate)) {
        // BitClass instances have an empty bits array at construction time, so we scan their
        // constants lazily when needed.
        boolean[] bits = (boolean[]) UNSAFE.getObject(charPredicate, BIT_CLASS_BITS_OFFSET);
        for (int i = 0; i < bits.length; i++) {
          if (bits[i]) {
            PREDICATE_SOLUTIONS.get().put(charPredicate, (char) i);
            return (char) i;
          }
        }
      }
      return null;
    });
  }

  // Limits a length to the maximum length libFuzzer will read up to in a callback.
  private static int limitedLength(int length) {
    return Math.min(length, 64);
  }

  // hookId only takes one distinct value per Node subclass. In order to get different regex matches
  // to be tracked similar to different instances of string compares, we mix in the hash of the
  // underlying pattern. We expect patterns to be static almost always, so that this should not fill
  // up the value profile map too quickly.
  private static int perRegexId(int hookId, Matcher matcher) {
    return hookId ^ matcher.pattern().toString().hashCode();
  }
}