V8 Project
jsregexp.cc
Go to the documentation of this file.
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/v8.h"
6 
7 #include "src/ast.h"
10 #include "src/compiler.h"
11 #include "src/execution.h"
12 #include "src/factory.h"
13 #include "src/jsregexp-inl.h"
14 #include "src/jsregexp.h"
15 #include "src/ostreams.h"
16 #include "src/parser.h"
20 #include "src/regexp-stack.h"
21 #include "src/runtime/runtime.h"
22 #include "src/string-search.h"
23 
24 #ifndef V8_INTERPRETED_REGEXP
25 #if V8_TARGET_ARCH_IA32
26 #include "src/ia32/regexp-macro-assembler-ia32.h" // NOLINT
27 #elif V8_TARGET_ARCH_X64
28 #include "src/x64/regexp-macro-assembler-x64.h" // NOLINT
29 #elif V8_TARGET_ARCH_ARM64
31 #elif V8_TARGET_ARCH_ARM
32 #include "src/arm/regexp-macro-assembler-arm.h" // NOLINT
33 #elif V8_TARGET_ARCH_MIPS
34 #include "src/mips/regexp-macro-assembler-mips.h" // NOLINT
35 #elif V8_TARGET_ARCH_MIPS64
37 #elif V8_TARGET_ARCH_X87
38 #include "src/x87/regexp-macro-assembler-x87.h" // NOLINT
39 #else
40 #error Unsupported target architecture.
41 #endif
42 #endif
43 
45 
46 
47 namespace v8 {
48 namespace internal {
49 
51  Handle<JSFunction> constructor,
52  Handle<String> pattern,
54  // Call the construct code with 2 arguments.
55  Handle<Object> argv[] = { pattern, flags };
56  return Execution::New(constructor, arraysize(argv), argv);
57 }
58 
59 
61  int flags = JSRegExp::NONE;
62  for (int i = 0; i < str->length(); i++) {
63  switch (str->Get(i)) {
64  case 'i':
66  break;
67  case 'g':
69  break;
70  case 'm':
72  break;
73  case 'y':
74  if (FLAG_harmony_regexps) flags |= JSRegExp::STICKY;
75  break;
76  }
77  }
78  return JSRegExp::Flags(flags);
79 }
80 
81 
85  Handle<String> pattern,
86  Handle<String> error_text,
87  const char* message) {
88  Isolate* isolate = re->GetIsolate();
89  Factory* factory = isolate->factory();
90  Handle<FixedArray> elements = factory->NewFixedArray(2);
91  elements->set(0, *pattern);
92  elements->set(1, *error_text);
93  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
94  Handle<Object> regexp_err;
95  THROW_NEW_ERROR(isolate, NewSyntaxError(message, array), Object);
96 }
97 
98 
100  const int* ranges,
101  int ranges_length,
102  Interval new_range) {
103  DCHECK((ranges_length & 1) == 1);
104  DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
105  if (containment == kLatticeUnknown) return containment;
106  bool inside = false;
107  int last = 0;
108  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
109  // Consider the range from last to ranges[i].
110  // We haven't got to the new range yet.
111  if (ranges[i] <= new_range.from()) continue;
112  // New range is wholly inside last-ranges[i]. Note that new_range.to() is
113  // inclusive, but the values in ranges are not.
114  if (last <= new_range.from() && new_range.to() < ranges[i]) {
115  return Combine(containment, inside ? kLatticeIn : kLatticeOut);
116  }
117  return kLatticeUnknown;
118  }
119  return containment;
120 }
121 
122 
123 // More makes code generation slower, less makes V8 benchmark score lower.
125 // In a 3-character pattern you can maximally step forwards 3 characters
126 // at a time, which is not always enough to pay for the extra logic.
128 
129 
130 // Identifies the sort of regexps where the regexp engine is faster
131 // than the code used for atom matches.
133  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
134  if (length <= kPatternTooShortForBoyerMoore) return false;
135  const int kMod = 128;
136  bool character_found[kMod];
137  int different = 0;
138  memset(&character_found[0], 0, sizeof(character_found));
139  for (int i = 0; i < length; i++) {
140  int ch = (pattern->Get(i) & (kMod - 1));
141  if (!character_found[ch]) {
142  character_found[ch] = true;
143  different++;
144  // We declare a regexp low-alphabet if it has at least 3 times as many
145  // characters as it has different characters.
146  if (different * 3 > length) return false;
147  }
148  }
149  return true;
150 }
151 
152 
153 // Generic RegExp methods. Dispatches to implementation specific methods.
154 
155 
157  Handle<String> pattern,
158  Handle<String> flag_str) {
159  Isolate* isolate = re->GetIsolate();
160  Zone zone(isolate);
162  CompilationCache* compilation_cache = isolate->compilation_cache();
163  MaybeHandle<FixedArray> maybe_cached =
164  compilation_cache->LookupRegExp(pattern, flags);
165  Handle<FixedArray> cached;
166  bool in_cache = maybe_cached.ToHandle(&cached);
167  LOG(isolate, RegExpCompileEvent(re, in_cache));
168 
169  Handle<Object> result;
170  if (in_cache) {
171  re->set_data(*cached);
172  return re;
173  }
174  pattern = String::Flatten(pattern);
175  PostponeInterruptsScope postpone(isolate);
176  RegExpCompileData parse_result;
177  FlatStringReader reader(isolate, pattern);
178  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
179  &parse_result, &zone)) {
180  // Throw an exception if we fail to parse the pattern.
181  return ThrowRegExpException(re,
182  pattern,
183  parse_result.error,
184  "malformed_regexp");
185  }
186 
187  bool has_been_compiled = false;
188 
189  if (parse_result.simple &&
190  !flags.is_ignore_case() &&
191  !flags.is_sticky() &&
192  !HasFewDifferentCharacters(pattern)) {
193  // Parse-tree is a single atom that is equal to the pattern.
194  AtomCompile(re, pattern, flags, pattern);
195  has_been_compiled = true;
196  } else if (parse_result.tree->IsAtom() &&
197  !flags.is_ignore_case() &&
198  !flags.is_sticky() &&
199  parse_result.capture_count == 0) {
200  RegExpAtom* atom = parse_result.tree->AsAtom();
201  Vector<const uc16> atom_pattern = atom->data();
202  Handle<String> atom_string;
204  isolate, atom_string,
205  isolate->factory()->NewStringFromTwoByte(atom_pattern),
206  Object);
207  if (!HasFewDifferentCharacters(atom_string)) {
208  AtomCompile(re, pattern, flags, atom_string);
209  has_been_compiled = true;
210  }
211  }
212  if (!has_been_compiled) {
213  IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
214  }
215  DCHECK(re->data()->IsFixedArray());
216  // Compilation succeeded so the data is set on the regexp
217  // and we can store it in the cache.
218  Handle<FixedArray> data(FixedArray::cast(re->data()));
219  compilation_cache->PutRegExp(pattern, flags, data);
220 
221  return re;
222 }
223 
224 
226  Handle<String> subject,
227  int index,
228  Handle<JSArray> last_match_info) {
229  switch (regexp->TypeTag()) {
230  case JSRegExp::ATOM:
231  return AtomExec(regexp, subject, index, last_match_info);
232  case JSRegExp::IRREGEXP: {
233  return IrregexpExec(regexp, subject, index, last_match_info);
234  }
235  default:
236  UNREACHABLE();
237  return MaybeHandle<Object>();
238  }
239 }
240 
241 
242 // RegExp Atom implementation: Simple string search using indexOf.
243 
244 
246  Handle<String> pattern,
248  Handle<String> match_pattern) {
249  re->GetIsolate()->factory()->SetRegExpAtomData(re,
251  pattern,
252  flags,
253  match_pattern);
254 }
255 
256 
257 static void SetAtomLastCapture(FixedArray* array,
258  String* subject,
259  int from,
260  int to) {
261  SealHandleScope shs(array->GetIsolate());
263  RegExpImpl::SetLastSubject(array, subject);
264  RegExpImpl::SetLastInput(array, subject);
265  RegExpImpl::SetCapture(array, 0, from);
266  RegExpImpl::SetCapture(array, 1, to);
267 }
268 
269 
271  Handle<String> subject,
272  int index,
273  int32_t* output,
274  int output_size) {
275  Isolate* isolate = regexp->GetIsolate();
276 
277  DCHECK(0 <= index);
278  DCHECK(index <= subject->length());
279 
280  subject = String::Flatten(subject);
281  DisallowHeapAllocation no_gc; // ensure vectors stay valid
282 
283  String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
284  int needle_len = needle->length();
285  DCHECK(needle->IsFlat());
286  DCHECK_LT(0, needle_len);
287 
288  if (index + needle_len > subject->length()) {
289  return RegExpImpl::RE_FAILURE;
290  }
291 
292  for (int i = 0; i < output_size; i += 2) {
293  String::FlatContent needle_content = needle->GetFlatContent();
294  String::FlatContent subject_content = subject->GetFlatContent();
295  DCHECK(needle_content.IsFlat());
296  DCHECK(subject_content.IsFlat());
297  // dispatch on type of strings
298  index =
299  (needle_content.IsOneByte()
300  ? (subject_content.IsOneByte()
301  ? SearchString(isolate, subject_content.ToOneByteVector(),
302  needle_content.ToOneByteVector(), index)
303  : SearchString(isolate, subject_content.ToUC16Vector(),
304  needle_content.ToOneByteVector(), index))
305  : (subject_content.IsOneByte()
306  ? SearchString(isolate, subject_content.ToOneByteVector(),
307  needle_content.ToUC16Vector(), index)
308  : SearchString(isolate, subject_content.ToUC16Vector(),
309  needle_content.ToUC16Vector(), index)));
310  if (index == -1) {
311  return i / 2; // Return number of matches.
312  } else {
313  output[i] = index;
314  output[i+1] = index + needle_len;
315  index += needle_len;
316  }
317  }
318  return output_size / 2;
319 }
320 
321 
323  Handle<String> subject,
324  int index,
325  Handle<JSArray> last_match_info) {
326  Isolate* isolate = re->GetIsolate();
327 
328  static const int kNumRegisters = 2;
330  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
331 
332  int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
333 
334  if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
335 
337  SealHandleScope shs(isolate);
338  FixedArray* array = FixedArray::cast(last_match_info->elements());
339  SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
340  return last_match_info;
341 }
342 
343 
344 // Irregexp implementation.
345 
346 // Ensures that the regexp object contains a compiled version of the
347 // source for either one-byte or two-byte subject strings.
348 // If the compiled version doesn't already exist, it is compiled
349 // from the source pattern.
350 // If compilation fails, an exception is thrown and this function
351 // returns false.
353  Handle<String> sample_subject,
354  bool is_one_byte) {
355  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
356 #ifdef V8_INTERPRETED_REGEXP
357  if (compiled_code->IsByteArray()) return true;
358 #else // V8_INTERPRETED_REGEXP (RegExp native code)
359  if (compiled_code->IsCode()) return true;
360 #endif
361  // We could potentially have marked this as flushable, but have kept
362  // a saved version if we did not flush it yet.
363  Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
364  if (saved_code->IsCode()) {
365  // Reinstate the code in the original place.
366  re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
367  DCHECK(compiled_code->IsSmi());
368  return true;
369  }
370  return CompileIrregexp(re, sample_subject, is_one_byte);
371 }
372 
373 
375  Handle<String> error_message,
376  Isolate* isolate) {
377  Factory* factory = isolate->factory();
378  Handle<FixedArray> elements = factory->NewFixedArray(2);
379  elements->set(0, re->Pattern());
380  elements->set(1, *error_message);
381  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
382  Handle<Object> error;
383  MaybeHandle<Object> maybe_error =
384  factory->NewSyntaxError("malformed_regexp", array);
385  if (maybe_error.ToHandle(&error)) isolate->Throw(*error);
386 }
387 
388 
390  Handle<String> sample_subject,
391  bool is_one_byte) {
392  // Compile the RegExp.
393  Isolate* isolate = re->GetIsolate();
394  Zone zone(isolate);
395  PostponeInterruptsScope postpone(isolate);
396  // If we had a compilation error the last time this is saved at the
397  // saved code index.
398  Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
399  // When arriving here entry can only be a smi, either representing an
400  // uncompiled regexp, a previous compilation error, or code that has
401  // been flushed.
402  DCHECK(entry->IsSmi());
403  int entry_value = Smi::cast(entry)->value();
404  DCHECK(entry_value == JSRegExp::kUninitializedValue ||
405  entry_value == JSRegExp::kCompilationErrorValue ||
406  (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
407 
408  if (entry_value == JSRegExp::kCompilationErrorValue) {
409  // A previous compilation failed and threw an error which we store in
410  // the saved code index (we store the error message, not the actual
411  // error). Recreate the error object and throw it.
412  Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
413  DCHECK(error_string->IsString());
414  Handle<String> error_message(String::cast(error_string));
415  CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
416  return false;
417  }
418 
419  JSRegExp::Flags flags = re->GetFlags();
420 
421  Handle<String> pattern(re->Pattern());
422  pattern = String::Flatten(pattern);
423  RegExpCompileData compile_data;
424  FlatStringReader reader(isolate, pattern);
425  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
426  &compile_data,
427  &zone)) {
428  // Throw an exception if we fail to parse the pattern.
429  // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
431  pattern,
432  compile_data.error,
433  "malformed_regexp"));
434  return false;
435  }
437  &compile_data, flags.is_ignore_case(), flags.is_global(),
438  flags.is_multiline(), flags.is_sticky(), pattern, sample_subject,
439  is_one_byte, &zone);
440  if (result.error_message != NULL) {
441  // Unable to compile regexp.
442  Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
443  CStrVector(result.error_message)).ToHandleChecked();
444  CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
445  return false;
446  }
447 
448  Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
449  data->set(JSRegExp::code_index(is_one_byte), result.code);
450  int register_max = IrregexpMaxRegisterCount(*data);
451  if (result.num_registers > register_max) {
453  }
454 
455  return true;
456 }
457 
458 
460  return Smi::cast(
462 }
463 
464 
467 }
468 
469 
471  return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
472 }
473 
474 
476  return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
477 }
478 
479 
481  return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
482 }
483 
484 
486  return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
487 }
488 
489 
491  Handle<String> pattern,
493  int capture_count) {
494  // Initialize compiled code entries to null.
495  re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
497  pattern,
498  flags,
499  capture_count);
500 }
501 
502 
504  Handle<String> subject) {
505  subject = String::Flatten(subject);
506 
507  // Check representation of the underlying storage.
508  bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
509  if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
510 
511 #ifdef V8_INTERPRETED_REGEXP
512  // Byte-code regexp needs space allocated for all its registers.
513  // The result captures are copied to the start of the registers array
514  // if the match succeeds. This way those registers are not clobbered
515  // when we set the last match info from last successful match.
516  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
517  (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
518 #else // V8_INTERPRETED_REGEXP
519  // Native regexp only needs room to output captures. Registers are handled
520  // internally.
521  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
522 #endif // V8_INTERPRETED_REGEXP
523 }
524 
525 
527  Handle<String> subject,
528  int index,
529  int32_t* output,
530  int output_size) {
531  Isolate* isolate = regexp->GetIsolate();
532 
533  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
534 
535  DCHECK(index >= 0);
536  DCHECK(index <= subject->length());
537  DCHECK(subject->IsFlat());
538 
539  bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
540 
541 #ifndef V8_INTERPRETED_REGEXP
542  DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
543  do {
544  EnsureCompiledIrregexp(regexp, subject, is_one_byte);
545  Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
546  // The stack is used to allocate registers for the compiled regexp code.
547  // This means that in case of failure, the output registers array is left
548  // untouched and contains the capture results from the previous successful
549  // match. We can use that to set the last match info lazily.
552  subject,
553  output,
554  output_size,
555  index,
556  isolate);
559  isolate->has_pending_exception());
561  static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
563  static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
565  == RE_EXCEPTION);
566  return static_cast<IrregexpResult>(res);
567  }
568  // If result is RETRY, the string has changed representation, and we
569  // must restart from scratch.
570  // In this case, it means we must make sure we are prepared to handle
571  // the, potentially, different subject (the string can switch between
572  // being internal and external, and even between being Latin1 and UC16,
573  // but the characters are always the same).
574  IrregexpPrepare(regexp, subject);
575  is_one_byte = subject->IsOneByteRepresentationUnderneath();
576  } while (true);
577  UNREACHABLE();
578  return RE_EXCEPTION;
579 #else // V8_INTERPRETED_REGEXP
580 
581  DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
582  // We must have done EnsureCompiledIrregexp, so we can get the number of
583  // registers.
584  int number_of_capture_registers =
585  (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
586  int32_t* raw_output = &output[number_of_capture_registers];
587  // We do not touch the actual capture result registers until we know there
588  // has been a match so that we can use those capture results to set the
589  // last match info.
590  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
591  raw_output[i] = -1;
592  }
593  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
594  isolate);
595 
597  byte_codes,
598  subject,
599  raw_output,
600  index);
601  if (result == RE_SUCCESS) {
602  // Copy capture results to the start of the registers array.
603  MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
604  }
605  if (result == RE_EXCEPTION) {
606  DCHECK(!isolate->has_pending_exception());
607  isolate->StackOverflow();
608  }
609  return result;
610 #endif // V8_INTERPRETED_REGEXP
611 }
612 
613 
615  Handle<String> subject,
616  int previous_index,
617  Handle<JSArray> last_match_info) {
618  Isolate* isolate = regexp->GetIsolate();
619  DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
620 
621  // Prepare space for the return values.
622 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
623  if (FLAG_trace_regexp_bytecodes) {
624  String* pattern = regexp->Pattern();
625  PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
626  PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
627  }
628 #endif
629  int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
630  if (required_registers < 0) {
631  // Compiling failed with an exception.
632  DCHECK(isolate->has_pending_exception());
633  return MaybeHandle<Object>();
634  }
635 
636  int32_t* output_registers = NULL;
637  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
638  output_registers = NewArray<int32_t>(required_registers);
639  }
640  SmartArrayPointer<int32_t> auto_release(output_registers);
641  if (output_registers == NULL) {
642  output_registers = isolate->jsregexp_static_offsets_vector();
643  }
644 
645  int res = RegExpImpl::IrregexpExecRaw(
646  regexp, subject, previous_index, output_registers, required_registers);
647  if (res == RE_SUCCESS) {
648  int capture_count =
649  IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
650  return SetLastMatchInfo(
651  last_match_info, subject, capture_count, output_registers);
652  }
653  if (res == RE_EXCEPTION) {
654  DCHECK(isolate->has_pending_exception());
655  return MaybeHandle<Object>();
656  }
657  DCHECK(res == RE_FAILURE);
658  return isolate->factory()->null_value();
659 }
660 
661 
663  Handle<String> subject,
664  int capture_count,
665  int32_t* match) {
666  DCHECK(last_match_info->HasFastObjectElements());
667  int capture_register_count = (capture_count + 1) * 2;
668  JSArray::EnsureSize(last_match_info,
669  capture_register_count + kLastMatchOverhead);
670  DisallowHeapAllocation no_allocation;
671  FixedArray* array = FixedArray::cast(last_match_info->elements());
672  if (match != NULL) {
673  for (int i = 0; i < capture_register_count; i += 2) {
674  SetCapture(array, i, match[i]);
675  SetCapture(array, i + 1, match[i + 1]);
676  }
677  }
678  SetLastCaptureCount(array, capture_register_count);
679  SetLastSubject(array, *subject);
680  SetLastInput(array, *subject);
681  return last_match_info;
682 }
683 
684 
686  Handle<String> subject,
687  bool is_global,
688  Isolate* isolate)
689  : register_array_(NULL),
690  register_array_size_(0),
691  regexp_(regexp),
692  subject_(subject) {
693 #ifdef V8_INTERPRETED_REGEXP
694  bool interpreted = true;
695 #else
696  bool interpreted = false;
697 #endif // V8_INTERPRETED_REGEXP
698 
699  if (regexp_->TypeTag() == JSRegExp::ATOM) {
700  static const int kAtomRegistersPerMatch = 2;
701  registers_per_match_ = kAtomRegistersPerMatch;
702  // There is no distinction between interpreted and native for atom regexps.
703  interpreted = false;
704  } else {
706  if (registers_per_match_ < 0) {
707  num_matches_ = -1; // Signal exception.
708  return;
709  }
710  }
711 
712  if (is_global && !interpreted) {
716  } else {
717  // Global loop in interpreted regexp is not implemented. We choose
718  // the size of the offsets vector so that it can only store one match.
720  max_matches_ = 1;
721  }
722 
724  register_array_ = NewArray<int32_t>(register_array_size_);
725  } else {
726  register_array_ = isolate->jsregexp_static_offsets_vector();
727  }
728 
729  // Set state so that fetching the results the first time triggers a call
730  // to the compiled regexp.
733  DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
735  int32_t* last_match =
737  last_match[0] = -1;
738  last_match[1] = 0;
739 }
740 
741 
742 // -------------------------------------------------------------------
743 // Implementation of the Irregexp regular expression engine.
744 //
745 // The Irregexp regular expression engine is intended to be a complete
746 // implementation of ECMAScript regular expressions. It generates either
747 // bytecodes or native code.
748 
749 // The Irregexp regexp engine is structured in three steps.
750 // 1) The parser generates an abstract syntax tree. See ast.cc.
751 // 2) From the AST a node network is created. The nodes are all
752 // subclasses of RegExpNode. The nodes represent states when
753 // executing a regular expression. Several optimizations are
754 // performed on the node network.
755 // 3) From the nodes we generate either byte codes or native code
756 // that can actually execute the regular expression (perform
757 // the search). The code generation step is described in more
758 // detail below.
759 
760 // Code generation.
761 //
762 // The nodes are divided into four main categories.
763 // * Choice nodes
764 // These represent places where the regular expression can
765 // match in more than one way. For example on entry to an
766 // alternation (foo|bar) or a repetition (*, +, ? or {}).
767 // * Action nodes
768 // These represent places where some action should be
769 // performed. Examples include recording the current position
770 // in the input string to a register (in order to implement
771 // captures) or other actions on register for example in order
772 // to implement the counters needed for {} repetitions.
773 // * Matching nodes
774 // These attempt to match some element part of the input string.
775 // Examples of elements include character classes, plain strings
776 // or back references.
777 // * End nodes
778 // These are used to implement the actions required on finding
779 // a successful match or failing to find a match.
780 //
781 // The code generated (whether as byte codes or native code) maintains
782 // some state as it runs. This consists of the following elements:
783 //
784 // * The capture registers. Used for string captures.
785 // * Other registers. Used for counters etc.
786 // * The current position.
787 // * The stack of backtracking information. Used when a matching node
788 // fails to find a match and needs to try an alternative.
789 //
790 // Conceptual regular expression execution model:
791 //
792 // There is a simple conceptual model of regular expression execution
793 // which will be presented first. The actual code generated is a more
794 // efficient simulation of the simple conceptual model:
795 //
796 // * Choice nodes are implemented as follows:
797 // For each choice except the last {
798 // push current position
799 // push backtrack code location
800 // <generate code to test for choice>
801 // backtrack code location:
802 // pop current position
803 // }
804 // <generate code to test for last choice>
805 //
806 // * Actions nodes are generated as follows
807 // <push affected registers on backtrack stack>
808 // <generate code to perform action>
809 // push backtrack code location
810 // <generate code to test for following nodes>
811 // backtrack code location:
812 // <pop affected registers to restore their state>
813 // <pop backtrack location from stack and go to it>
814 //
815 // * Matching nodes are generated as follows:
816 // if input string matches at current position
817 // update current position
818 // <generate code to test for following nodes>
819 // else
820 // <pop backtrack location from stack and go to it>
821 //
822 // Thus it can be seen that the current position is saved and restored
823 // by the choice nodes, whereas the registers are saved and restored by
824 // by the action nodes that manipulate them.
825 //
826 // The other interesting aspect of this model is that nodes are generated
827 // at the point where they are needed by a recursive call to Emit(). If
828 // the node has already been code generated then the Emit() call will
829 // generate a jump to the previously generated code instead. In order to
830 // limit recursion it is possible for the Emit() function to put the node
831 // on a work list for later generation and instead generate a jump. The
832 // destination of the jump is resolved later when the code is generated.
833 //
834 // Actual regular expression code generation.
835 //
836 // Code generation is actually more complicated than the above. In order
837 // to improve the efficiency of the generated code some optimizations are
838 // performed
839 //
840 // * Choice nodes have 1-character lookahead.
841 // A choice node looks at the following character and eliminates some of
842 // the choices immediately based on that character. This is not yet
843 // implemented.
844 // * Simple greedy loops store reduced backtracking information.
845 // A quantifier like /.*foo/m will greedily match the whole input. It will
846 // then need to backtrack to a point where it can match "foo". The naive
847 // implementation of this would push each character position onto the
848 // backtracking stack, then pop them off one by one. This would use space
849 // proportional to the length of the input string. However since the "."
850 // can only match in one way and always has a constant length (in this case
851 // of 1) it suffices to store the current position on the top of the stack
852 // once. Matching now becomes merely incrementing the current position and
853 // backtracking becomes decrementing the current position and checking the
854 // result against the stored current position. This is faster and saves
855 // space.
856 // * The current state is virtualized.
857 // This is used to defer expensive operations until it is clear that they
858 // are needed and to generate code for a node more than once, allowing
859 // specialized an efficient versions of the code to be created. This is
860 // explained in the section below.
861 //
862 // Execution state virtualization.
863 //
864 // Instead of emitting code, nodes that manipulate the state can record their
865 // manipulation in an object called the Trace. The Trace object can record a
866 // current position offset, an optional backtrack code location on the top of
867 // the virtualized backtrack stack and some register changes. When a node is
868 // to be emitted it can flush the Trace or update it. Flushing the Trace
869 // will emit code to bring the actual state into line with the virtual state.
870 // Avoiding flushing the state can postpone some work (e.g. updates of capture
871 // registers). Postponing work can save time when executing the regular
872 // expression since it may be found that the work never has to be done as a
873 // failure to match can occur. In addition it is much faster to jump to a
874 // known backtrack code location than it is to pop an unknown backtrack
875 // location from the stack and jump there.
876 //
877 // The virtual state found in the Trace affects code generation. For example
878 // the virtual state contains the difference between the actual current
879 // position and the virtual current position, and matching code needs to use
880 // this offset to attempt a match in the correct location of the input
881 // string. Therefore code generated for a non-trivial trace is specialized
882 // to that trace. The code generator therefore has the ability to generate
883 // code for each node several times. In order to limit the size of the
884 // generated code there is an arbitrary limit on how many specialized sets of
885 // code may be generated for a given node. If the limit is reached, the
886 // trace is flushed and a generic version of the code for a node is emitted.
887 // This is subsequently used for that node. The code emitted for non-generic
888 // trace is not recorded in the node and so it cannot currently be reused in
889 // the event that code generation is requested for an identical trace.
890 
891 
892 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
893  UNREACHABLE();
894 }
895 
896 
897 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
898  text->AddElement(TextElement::Atom(this), zone);
899 }
900 
901 
902 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
903  text->AddElement(TextElement::CharClass(this), zone);
904 }
905 
906 
907 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
908  for (int i = 0; i < elements()->length(); i++)
909  text->AddElement(elements()->at(i), zone);
910 }
911 
912 
913 TextElement TextElement::Atom(RegExpAtom* atom) {
914  return TextElement(ATOM, atom);
915 }
916 
917 
918 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
919  return TextElement(CHAR_CLASS, char_class);
920 }
921 
922 
923 int TextElement::length() const {
924  switch (text_type()) {
925  case ATOM:
926  return atom()->length();
927 
928  case CHAR_CLASS:
929  return 1;
930  }
931  UNREACHABLE();
932  return 0;
933 }
934 
935 
937  if (table_ == NULL) {
938  table_ = new(zone()) DispatchTable(zone());
939  DispatchTableConstructor cons(table_, ignore_case, zone());
940  cons.BuildTable(this);
941  }
942  return table_;
943 }
944 
945 
947  public:
948  FrequencyCollator() : total_samples_(0) {
949  for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
950  frequencies_[i] = CharacterFrequency(i);
951  }
952  }
953 
954  void CountCharacter(int character) {
955  int index = (character & RegExpMacroAssembler::kTableMask);
956  frequencies_[index].Increment();
957  total_samples_++;
958  }
959 
960  // Does not measure in percent, but rather per-128 (the table size from the
961  // regexp macro assembler).
962  int Frequency(int in_character) {
963  DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
964  if (total_samples_ < 1) return 1; // Division by zero.
965  int freq_in_per128 =
966  (frequencies_[in_character].counter() * 128) / total_samples_;
967  return freq_in_per128;
968  }
969 
970  private:
972  public:
973  CharacterFrequency() : counter_(0), character_(-1) { }
974  explicit CharacterFrequency(int character)
975  : counter_(0), character_(character) { }
976 
977  void Increment() { counter_++; }
978  int counter() { return counter_; }
979  int character() { return character_; }
980 
981  private:
982  int counter_;
984  };
985 
986 
987  private:
990 };
991 
992 
994  public:
995  RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte,
996  Zone* zone);
997 
999  if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1000  reg_exp_too_big_ = true;
1001  return next_register_;
1002  }
1003  return next_register_++;
1004  }
1005 
1007  RegExpNode* start,
1008  int capture_count,
1009  Handle<String> pattern);
1010 
1011  inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1012 
1013  static const int kImplementationOffset = 0;
1014  static const int kNumberOfRegistersOffset = 0;
1015  static const int kCodeOffset = 1;
1016 
1017  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1018  EndNode* accept() { return accept_; }
1019 
1020  static const int kMaxRecursion = 100;
1021  inline int recursion_depth() { return recursion_depth_; }
1022  inline void IncrementRecursionDepth() { recursion_depth_++; }
1023  inline void DecrementRecursionDepth() { recursion_depth_--; }
1024 
1025  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1026 
1027  inline bool ignore_case() { return ignore_case_; }
1028  inline bool one_byte() { return one_byte_; }
1029  FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1030 
1031  int current_expansion_factor() { return current_expansion_factor_; }
1033  current_expansion_factor_ = value;
1034  }
1035 
1036  Zone* zone() const { return zone_; }
1037 
1038  static const int kNoRegister = -1;
1039 
1040  private:
1052 };
1053 
1054 
1056  public:
1057  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1058  compiler->IncrementRecursionDepth();
1059  }
1060  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1061  private:
1063 };
1064 
1065 
1067  return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1068 }
1069 
1070 
1071 // Attempts to compile the regexp using an Irregexp code generator. Returns
1072 // a fixed array or a null handle depending on whether it succeeded.
1073 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case,
1074  bool one_byte, Zone* zone)
1075  : next_register_(2 * (capture_count + 1)),
1076  work_list_(NULL),
1077  recursion_depth_(0),
1078  ignore_case_(ignore_case),
1079  one_byte_(one_byte),
1080  reg_exp_too_big_(false),
1081  current_expansion_factor_(1),
1082  frequency_collator_(),
1083  zone_(zone) {
1086 }
1087 
1088 
1090  RegExpMacroAssembler* macro_assembler,
1091  RegExpNode* start,
1092  int capture_count,
1093  Handle<String> pattern) {
1094  Heap* heap = pattern->GetHeap();
1095 
1096  bool use_slow_safe_regexp_compiler = false;
1097  if (heap->total_regexp_code_generated() >
1099  heap->isolate()->memory_allocator()->SizeExecutable() >
1101  use_slow_safe_regexp_compiler = true;
1102  }
1103 
1104  macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1105 
1106 #ifdef DEBUG
1107  if (FLAG_trace_regexp_assembler)
1109  else
1110 #endif
1112 
1113  List <RegExpNode*> work_list(0);
1114  work_list_ = &work_list;
1115  Label fail;
1117  Trace new_trace;
1118  start->Emit(this, &new_trace);
1119  macro_assembler_->Bind(&fail);
1121  while (!work_list.is_empty()) {
1122  work_list.RemoveLast()->Emit(this, &new_trace);
1123  }
1125 
1126  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1127  heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1128  work_list_ = NULL;
1129 #ifdef DEBUG
1130  if (FLAG_print_code) {
1131  CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1132  OFStream os(trace_scope.file());
1133  Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1134  }
1135  if (FLAG_trace_regexp_assembler) {
1136  delete macro_assembler_;
1137  }
1138 #endif
1140 }
1141 
1142 
1145  Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1146  return range.Contains(that);
1147  } else {
1148  return reg() == that;
1149  }
1150 }
1151 
1152 
1153 bool Trace::mentions_reg(int reg) {
1154  for (DeferredAction* action = actions_;
1155  action != NULL;
1156  action = action->next()) {
1157  if (action->Mentions(reg))
1158  return true;
1159  }
1160  return false;
1161 }
1162 
1163 
1165  DCHECK_EQ(0, *cp_offset);
1166  for (DeferredAction* action = actions_;
1167  action != NULL;
1168  action = action->next()) {
1169  if (action->Mentions(reg)) {
1170  if (action->action_type() == ActionNode::STORE_POSITION) {
1171  *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1172  return true;
1173  } else {
1174  return false;
1175  }
1176  }
1177  }
1178  return false;
1179 }
1180 
1181 
1182 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1183  Zone* zone) {
1184  int max_register = RegExpCompiler::kNoRegister;
1185  for (DeferredAction* action = actions_;
1186  action != NULL;
1187  action = action->next()) {
1188  if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1189  Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1190  for (int i = range.from(); i <= range.to(); i++)
1191  affected_registers->Set(i, zone);
1192  if (range.to() > max_register) max_register = range.to();
1193  } else {
1194  affected_registers->Set(action->reg(), zone);
1195  if (action->reg() > max_register) max_register = action->reg();
1196  }
1197  }
1198  return max_register;
1199 }
1200 
1201 
1203  int max_register,
1204  const OutSet& registers_to_pop,
1205  const OutSet& registers_to_clear) {
1206  for (int reg = max_register; reg >= 0; reg--) {
1207  if (registers_to_pop.Get(reg)) {
1208  assembler->PopRegister(reg);
1209  } else if (registers_to_clear.Get(reg)) {
1210  int clear_to = reg;
1211  while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1212  reg--;
1213  }
1214  assembler->ClearRegisters(reg, clear_to);
1215  }
1216  }
1217 }
1218 
1219 
1221  int max_register,
1222  const OutSet& affected_registers,
1223  OutSet* registers_to_pop,
1224  OutSet* registers_to_clear,
1225  Zone* zone) {
1226  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1227  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1228 
1229  // Count pushes performed to force a stack limit check occasionally.
1230  int pushes = 0;
1231 
1232  for (int reg = 0; reg <= max_register; reg++) {
1233  if (!affected_registers.Get(reg)) {
1234  continue;
1235  }
1236 
1237  // The chronologically first deferred action in the trace
1238  // is used to infer the action needed to restore a register
1239  // to its previous state (or not, if it's safe to ignore it).
1240  enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1241  DeferredActionUndoType undo_action = IGNORE;
1242 
1243  int value = 0;
1244  bool absolute = false;
1245  bool clear = false;
1246  int store_position = -1;
1247  // This is a little tricky because we are scanning the actions in reverse
1248  // historical order (newest first).
1249  for (DeferredAction* action = actions_;
1250  action != NULL;
1251  action = action->next()) {
1252  if (action->Mentions(reg)) {
1253  switch (action->action_type()) {
1254  case ActionNode::SET_REGISTER: {
1256  static_cast<Trace::DeferredSetRegister*>(action);
1257  if (!absolute) {
1258  value += psr->value();
1259  absolute = true;
1260  }
1261  // SET_REGISTER is currently only used for newly introduced loop
1262  // counters. They can have a significant previous value if they
1263  // occour in a loop. TODO(lrn): Propagate this information, so
1264  // we can set undo_action to IGNORE if we know there is no value to
1265  // restore.
1266  undo_action = RESTORE;
1267  DCHECK_EQ(store_position, -1);
1268  DCHECK(!clear);
1269  break;
1270  }
1272  if (!absolute) {
1273  value++;
1274  }
1275  DCHECK_EQ(store_position, -1);
1276  DCHECK(!clear);
1277  undo_action = RESTORE;
1278  break;
1281  static_cast<Trace::DeferredCapture*>(action);
1282  if (!clear && store_position == -1) {
1283  store_position = pc->cp_offset();
1284  }
1285 
1286  // For captures we know that stores and clears alternate.
1287  // Other register, are never cleared, and if the occur
1288  // inside a loop, they might be assigned more than once.
1289  if (reg <= 1) {
1290  // Registers zero and one, aka "capture zero", is
1291  // always set correctly if we succeed. There is no
1292  // need to undo a setting on backtrack, because we
1293  // will set it again or fail.
1294  undo_action = IGNORE;
1295  } else {
1296  undo_action = pc->is_capture() ? CLEAR : RESTORE;
1297  }
1298  DCHECK(!absolute);
1299  DCHECK_EQ(value, 0);
1300  break;
1301  }
1303  // Since we're scanning in reverse order, if we've already
1304  // set the position we have to ignore historically earlier
1305  // clearing operations.
1306  if (store_position == -1) {
1307  clear = true;
1308  }
1309  undo_action = RESTORE;
1310  DCHECK(!absolute);
1311  DCHECK_EQ(value, 0);
1312  break;
1313  }
1314  default:
1315  UNREACHABLE();
1316  break;
1317  }
1318  }
1319  }
1320  // Prepare for the undo-action (e.g., push if it's going to be popped).
1321  if (undo_action == RESTORE) {
1322  pushes++;
1325  if (pushes == push_limit) {
1327  pushes = 0;
1328  }
1329 
1330  assembler->PushRegister(reg, stack_check);
1331  registers_to_pop->Set(reg, zone);
1332  } else if (undo_action == CLEAR) {
1333  registers_to_clear->Set(reg, zone);
1334  }
1335  // Perform the chronologically last action (or accumulated increment)
1336  // for the register.
1337  if (store_position != -1) {
1338  assembler->WriteCurrentPositionToRegister(reg, store_position);
1339  } else if (clear) {
1340  assembler->ClearRegisters(reg, reg);
1341  } else if (absolute) {
1342  assembler->SetRegister(reg, value);
1343  } else if (value != 0) {
1344  assembler->AdvanceRegister(reg, value);
1345  }
1346  }
1347 }
1348 
1349 
1350 // This is called as we come into a loop choice node and some other tricky
1351 // nodes. It normalizes the state of the code generator to ensure we can
1352 // generate generic code.
1353 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1354  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1355 
1356  DCHECK(!is_trivial());
1357 
1358  if (actions_ == NULL && backtrack() == NULL) {
1359  // Here we just have some deferred cp advances to fix and we are back to
1360  // a normal situation. We may also have to forget some information gained
1361  // through a quick check that was already performed.
1362  if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1363  // Create a new trivial state and generate the node with that.
1364  Trace new_state;
1365  successor->Emit(compiler, &new_state);
1366  return;
1367  }
1368 
1369  // Generate deferred actions here along with code to undo them again.
1370  OutSet affected_registers;
1371 
1372  if (backtrack() != NULL) {
1373  // Here we have a concrete backtrack location. These are set up by choice
1374  // nodes and so they indicate that we have a deferred save of the current
1375  // position which we may need to emit here.
1376  assembler->PushCurrentPosition();
1377  }
1378 
1379  int max_register = FindAffectedRegisters(&affected_registers,
1380  compiler->zone());
1381  OutSet registers_to_pop;
1382  OutSet registers_to_clear;
1383  PerformDeferredActions(assembler,
1384  max_register,
1385  affected_registers,
1386  &registers_to_pop,
1387  &registers_to_clear,
1388  compiler->zone());
1389  if (cp_offset_ != 0) {
1390  assembler->AdvanceCurrentPosition(cp_offset_);
1391  }
1392 
1393  // Create a new trivial state and generate the node with that.
1394  Label undo;
1395  assembler->PushBacktrack(&undo);
1396  Trace new_state;
1397  successor->Emit(compiler, &new_state);
1398 
1399  // On backtrack we need to restore state.
1400  assembler->Bind(&undo);
1401  RestoreAffectedRegisters(assembler,
1402  max_register,
1403  registers_to_pop,
1404  registers_to_clear);
1405  if (backtrack() == NULL) {
1406  assembler->Backtrack();
1407  } else {
1408  assembler->PopCurrentPosition();
1409  assembler->GoTo(backtrack());
1410  }
1411 }
1412 
1413 
1415  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1416 
1417  // Omit flushing the trace. We discard the entire stack frame anyway.
1418 
1419  if (!label()->is_bound()) {
1420  // We are completely independent of the trace, since we ignore it,
1421  // so this code can be used as the generic version.
1422  assembler->Bind(label());
1423  }
1424 
1425  // Throw away everything on the backtrack stack since the start
1426  // of the negative submatch and restore the character position.
1427  assembler->ReadCurrentPositionFromRegister(current_position_register_);
1428  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1429  if (clear_capture_count_ > 0) {
1430  // Clear any captures that might have been performed during the success
1431  // of the body of the negative look-ahead.
1432  int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1433  assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1434  }
1435  // Now that we have unwound the stack we find at the top of the stack the
1436  // backtrack that the BeginSubmatch node got.
1437  assembler->Backtrack();
1438 }
1439 
1440 
1441 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1442  if (!trace->is_trivial()) {
1443  trace->Flush(compiler, this);
1444  return;
1445  }
1446  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1447  if (!label()->is_bound()) {
1448  assembler->Bind(label());
1449  }
1450  switch (action_) {
1451  case ACCEPT:
1452  assembler->Succeed();
1453  return;
1454  case BACKTRACK:
1455  assembler->GoTo(trace->backtrack());
1456  return;
1457  case NEGATIVE_SUBMATCH_SUCCESS:
1458  // This case is handled in a different virtual method.
1459  UNREACHABLE();
1460  }
1461  UNIMPLEMENTED();
1462 }
1463 
1464 
1466  if (guards_ == NULL)
1467  guards_ = new(zone) ZoneList<Guard*>(1, zone);
1468  guards_->Add(guard, zone);
1469 }
1470 
1471 
1473  int val,
1474  RegExpNode* on_success) {
1475  ActionNode* result =
1476  new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1477  result->data_.u_store_register.reg = reg;
1478  result->data_.u_store_register.value = val;
1479  return result;
1480 }
1481 
1482 
1484  ActionNode* result =
1485  new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1486  result->data_.u_increment_register.reg = reg;
1487  return result;
1488 }
1489 
1490 
1492  bool is_capture,
1493  RegExpNode* on_success) {
1494  ActionNode* result =
1495  new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1496  result->data_.u_position_register.reg = reg;
1497  result->data_.u_position_register.is_capture = is_capture;
1498  return result;
1499 }
1500 
1501 
1503  RegExpNode* on_success) {
1504  ActionNode* result =
1505  new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1506  result->data_.u_clear_captures.range_from = range.from();
1507  result->data_.u_clear_captures.range_to = range.to();
1508  return result;
1509 }
1510 
1511 
1513  int position_reg,
1514  RegExpNode* on_success) {
1515  ActionNode* result =
1516  new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1517  result->data_.u_submatch.stack_pointer_register = stack_reg;
1518  result->data_.u_submatch.current_position_register = position_reg;
1519  return result;
1520 }
1521 
1522 
1524  int position_reg,
1525  int clear_register_count,
1526  int clear_register_from,
1527  RegExpNode* on_success) {
1528  ActionNode* result =
1529  new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1530  result->data_.u_submatch.stack_pointer_register = stack_reg;
1531  result->data_.u_submatch.current_position_register = position_reg;
1532  result->data_.u_submatch.clear_register_count = clear_register_count;
1533  result->data_.u_submatch.clear_register_from = clear_register_from;
1534  return result;
1535 }
1536 
1537 
1539  int repetition_register,
1540  int repetition_limit,
1541  RegExpNode* on_success) {
1542  ActionNode* result =
1543  new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1544  result->data_.u_empty_match_check.start_register = start_register;
1545  result->data_.u_empty_match_check.repetition_register = repetition_register;
1546  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1547  return result;
1548 }
1549 
1550 
1551 #define DEFINE_ACCEPT(Type) \
1552  void Type##Node::Accept(NodeVisitor* visitor) { \
1553  visitor->Visit##Type(this); \
1554  }
1556 #undef DEFINE_ACCEPT
1557 
1558 
1560  visitor->VisitLoopChoice(this);
1561 }
1562 
1563 
1564 // -------------------------------------------------------------------
1565 // Emit code.
1566 
1567 
1569  Guard* guard,
1570  Trace* trace) {
1571  switch (guard->op()) {
1572  case Guard::LT:
1573  DCHECK(!trace->mentions_reg(guard->reg()));
1574  macro_assembler->IfRegisterGE(guard->reg(),
1575  guard->value(),
1576  trace->backtrack());
1577  break;
1578  case Guard::GEQ:
1579  DCHECK(!trace->mentions_reg(guard->reg()));
1580  macro_assembler->IfRegisterLT(guard->reg(),
1581  guard->value(),
1582  trace->backtrack());
1583  break;
1584  }
1585 }
1586 
1587 
1588 // Returns the number of characters in the equivalence class, omitting those
1589 // that cannot occur in the source string because it is ASCII.
1590 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1591  bool one_byte_subject,
1592  unibrow::uchar* letters) {
1593  int length =
1594  isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1595  // Unibrow returns 0 or 1 for characters where case independence is
1596  // trivial.
1597  if (length == 0) {
1598  letters[0] = character;
1599  length = 1;
1600  }
1601  if (!one_byte_subject || character <= String::kMaxOneByteCharCode) {
1602  return length;
1603  }
1604 
1605  // The standard requires that non-ASCII characters cannot have ASCII
1606  // character codes in their equivalence class.
1607  // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
1608  // is it? For example, \u00C5 is equivalent to \u212B.
1609  return 0;
1610 }
1611 
1612 
1613 static inline bool EmitSimpleCharacter(Isolate* isolate,
1614  RegExpCompiler* compiler,
1615  uc16 c,
1616  Label* on_failure,
1617  int cp_offset,
1618  bool check,
1619  bool preloaded) {
1620  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1621  bool bound_checked = false;
1622  if (!preloaded) {
1623  assembler->LoadCurrentCharacter(
1624  cp_offset,
1625  on_failure,
1626  check);
1627  bound_checked = true;
1628  }
1629  assembler->CheckNotCharacter(c, on_failure);
1630  return bound_checked;
1631 }
1632 
1633 
1634 // Only emits non-letters (things that don't have case). Only used for case
1635 // independent matches.
1636 static inline bool EmitAtomNonLetter(Isolate* isolate,
1637  RegExpCompiler* compiler,
1638  uc16 c,
1639  Label* on_failure,
1640  int cp_offset,
1641  bool check,
1642  bool preloaded) {
1643  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1644  bool one_byte = compiler->one_byte();
1646  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1647  if (length < 1) {
1648  // This can't match. Must be an one-byte subject and a non-one-byte
1649  // character. We do not need to do anything since the one-byte pass
1650  // already handled this.
1651  return false; // Bounds not checked.
1652  }
1653  bool checked = false;
1654  // We handle the length > 1 case in a later pass.
1655  if (length == 1) {
1656  if (one_byte && c > String::kMaxOneByteCharCodeU) {
1657  // Can't match - see above.
1658  return false; // Bounds not checked.
1659  }
1660  if (!preloaded) {
1661  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1662  checked = check;
1663  }
1664  macro_assembler->CheckNotCharacter(c, on_failure);
1665  }
1666  return checked;
1667 }
1668 
1669 
1671  bool one_byte, uc16 c1, uc16 c2,
1672  Label* on_failure) {
1673  uc16 char_mask;
1674  if (one_byte) {
1675  char_mask = String::kMaxOneByteCharCode;
1676  } else {
1677  char_mask = String::kMaxUtf16CodeUnit;
1678  }
1679  uc16 exor = c1 ^ c2;
1680  // Check whether exor has only one bit set.
1681  if (((exor - 1) & exor) == 0) {
1682  // If c1 and c2 differ only by one bit.
1683  // Ecma262UnCanonicalize always gives the highest number last.
1684  DCHECK(c2 > c1);
1685  uc16 mask = char_mask ^ exor;
1686  macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1687  return true;
1688  }
1689  DCHECK(c2 > c1);
1690  uc16 diff = c2 - c1;
1691  if (((diff - 1) & diff) == 0 && c1 >= diff) {
1692  // If the characters differ by 2^n but don't differ by one bit then
1693  // subtract the difference from the found character, then do the or
1694  // trick. We avoid the theoretical case where negative numbers are
1695  // involved in order to simplify code generation.
1696  uc16 mask = char_mask ^ diff;
1697  macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1698  diff,
1699  mask,
1700  on_failure);
1701  return true;
1702  }
1703  return false;
1704 }
1705 
1706 
1707 typedef bool EmitCharacterFunction(Isolate* isolate,
1708  RegExpCompiler* compiler,
1709  uc16 c,
1710  Label* on_failure,
1711  int cp_offset,
1712  bool check,
1713  bool preloaded);
1714 
1715 // Only emits letters (things that have case). Only used for case independent
1716 // matches.
1717 static inline bool EmitAtomLetter(Isolate* isolate,
1718  RegExpCompiler* compiler,
1719  uc16 c,
1720  Label* on_failure,
1721  int cp_offset,
1722  bool check,
1723  bool preloaded) {
1724  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1725  bool one_byte = compiler->one_byte();
1727  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1728  if (length <= 1) return false;
1729  // We may not need to check against the end of the input string
1730  // if this character lies before a character that matched.
1731  if (!preloaded) {
1732  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1733  }
1734  Label ok;
1736  switch (length) {
1737  case 2: {
1738  if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1739  chars[1], on_failure)) {
1740  } else {
1741  macro_assembler->CheckCharacter(chars[0], &ok);
1742  macro_assembler->CheckNotCharacter(chars[1], on_failure);
1743  macro_assembler->Bind(&ok);
1744  }
1745  break;
1746  }
1747  case 4:
1748  macro_assembler->CheckCharacter(chars[3], &ok);
1749  // Fall through!
1750  case 3:
1751  macro_assembler->CheckCharacter(chars[0], &ok);
1752  macro_assembler->CheckCharacter(chars[1], &ok);
1753  macro_assembler->CheckNotCharacter(chars[2], on_failure);
1754  macro_assembler->Bind(&ok);
1755  break;
1756  default:
1757  UNREACHABLE();
1758  break;
1759  }
1760  return true;
1761 }
1762 
1763 
1765  int border,
1766  Label* fall_through,
1767  Label* above_or_equal,
1768  Label* below) {
1769  if (below != fall_through) {
1770  masm->CheckCharacterLT(border, below);
1771  if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1772  } else {
1773  masm->CheckCharacterGT(border - 1, above_or_equal);
1774  }
1775 }
1776 
1777 
1779  int first,
1780  int last,
1781  Label* fall_through,
1782  Label* in_range,
1783  Label* out_of_range) {
1784  if (in_range == fall_through) {
1785  if (first == last) {
1786  masm->CheckNotCharacter(first, out_of_range);
1787  } else {
1788  masm->CheckCharacterNotInRange(first, last, out_of_range);
1789  }
1790  } else {
1791  if (first == last) {
1792  masm->CheckCharacter(first, in_range);
1793  } else {
1794  masm->CheckCharacterInRange(first, last, in_range);
1795  }
1796  if (out_of_range != fall_through) masm->GoTo(out_of_range);
1797  }
1798 }
1799 
1800 
1801 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1802 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1804  RegExpMacroAssembler* masm,
1805  ZoneList<int>* ranges,
1806  int start_index,
1807  int end_index,
1808  int min_char,
1809  Label* fall_through,
1810  Label* even_label,
1811  Label* odd_label) {
1812  static const int kSize = RegExpMacroAssembler::kTableSize;
1813  static const int kMask = RegExpMacroAssembler::kTableMask;
1814 
1815  int base = (min_char & ~kMask);
1816  USE(base);
1817 
1818  // Assert that everything is on one kTableSize page.
1819  for (int i = start_index; i <= end_index; i++) {
1820  DCHECK_EQ(ranges->at(i) & ~kMask, base);
1821  }
1822  DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1823 
1824  char templ[kSize];
1825  Label* on_bit_set;
1826  Label* on_bit_clear;
1827  int bit;
1828  if (even_label == fall_through) {
1829  on_bit_set = odd_label;
1830  on_bit_clear = even_label;
1831  bit = 1;
1832  } else {
1833  on_bit_set = even_label;
1834  on_bit_clear = odd_label;
1835  bit = 0;
1836  }
1837  for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1838  templ[i] = bit;
1839  }
1840  int j = 0;
1841  bit ^= 1;
1842  for (int i = start_index; i < end_index; i++) {
1843  for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1844  templ[j] = bit;
1845  }
1846  bit ^= 1;
1847  }
1848  for (int i = j; i < kSize; i++) {
1849  templ[i] = bit;
1850  }
1851  Factory* factory = masm->zone()->isolate()->factory();
1852  // TODO(erikcorry): Cache these.
1853  Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1854  for (int i = 0; i < kSize; i++) {
1855  ba->set(i, templ[i]);
1856  }
1857  masm->CheckBitInTable(ba, on_bit_set);
1858  if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1859 }
1860 
1861 
1863  ZoneList<int>* ranges,
1864  int start_index,
1865  int end_index,
1866  int cut_index,
1867  Label* even_label,
1868  Label* odd_label) {
1869  bool odd = (((cut_index - start_index) & 1) == 1);
1870  Label* in_range_label = odd ? odd_label : even_label;
1871  Label dummy;
1873  ranges->at(cut_index),
1874  ranges->at(cut_index + 1) - 1,
1875  &dummy,
1876  in_range_label,
1877  &dummy);
1878  DCHECK(!dummy.is_linked());
1879  // Cut out the single range by rewriting the array. This creates a new
1880  // range that is a merger of the two ranges on either side of the one we
1881  // are cutting out. The oddity of the labels is preserved.
1882  for (int j = cut_index; j > start_index; j--) {
1883  ranges->at(j) = ranges->at(j - 1);
1884  }
1885  for (int j = cut_index + 1; j < end_index; j++) {
1886  ranges->at(j) = ranges->at(j + 1);
1887  }
1888 }
1889 
1890 
1891 // Unicode case. Split the search space into kSize spaces that are handled
1892 // with recursion.
1893 static void SplitSearchSpace(ZoneList<int>* ranges,
1894  int start_index,
1895  int end_index,
1896  int* new_start_index,
1897  int* new_end_index,
1898  int* border) {
1899  static const int kSize = RegExpMacroAssembler::kTableSize;
1900  static const int kMask = RegExpMacroAssembler::kTableMask;
1901 
1902  int first = ranges->at(start_index);
1903  int last = ranges->at(end_index) - 1;
1904 
1905  *new_start_index = start_index;
1906  *border = (ranges->at(start_index) & ~kMask) + kSize;
1907  while (*new_start_index < end_index) {
1908  if (ranges->at(*new_start_index) > *border) break;
1909  (*new_start_index)++;
1910  }
1911  // new_start_index is the index of the first edge that is beyond the
1912  // current kSize space.
1913 
1914  // For very large search spaces we do a binary chop search of the non-Latin1
1915  // space instead of just going to the end of the current kSize space. The
1916  // heuristics are complicated a little by the fact that any 128-character
1917  // encoding space can be quickly tested with a table lookup, so we don't
1918  // wish to do binary chop search at a smaller granularity than that. A
1919  // 128-character space can take up a lot of space in the ranges array if,
1920  // for example, we only want to match every second character (eg. the lower
1921  // case characters on some Unicode pages).
1922  int binary_chop_index = (end_index + start_index) / 2;
1923  // The first test ensures that we get to the code that handles the Latin1
1924  // range with a single not-taken branch, speeding up this important
1925  // character range (even non-Latin1 charset-based text has spaces and
1926  // punctuation).
1927  if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1928  end_index - start_index > (*new_start_index - start_index) * 2 &&
1929  last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1930  ranges->at(binary_chop_index) >= first + 2 * kSize) {
1931  int scan_forward_for_section_border = binary_chop_index;;
1932  int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1933 
1934  while (scan_forward_for_section_border < end_index) {
1935  if (ranges->at(scan_forward_for_section_border) > new_border) {
1936  *new_start_index = scan_forward_for_section_border;
1937  *border = new_border;
1938  break;
1939  }
1940  scan_forward_for_section_border++;
1941  }
1942  }
1943 
1944  DCHECK(*new_start_index > start_index);
1945  *new_end_index = *new_start_index - 1;
1946  if (ranges->at(*new_end_index) == *border) {
1947  (*new_end_index)--;
1948  }
1949  if (*border >= ranges->at(end_index)) {
1950  *border = ranges->at(end_index);
1951  *new_start_index = end_index; // Won't be used.
1952  *new_end_index = end_index - 1;
1953  }
1954 }
1955 
1956 
1957 // Gets a series of segment boundaries representing a character class. If the
1958 // character is in the range between an even and an odd boundary (counting from
1959 // start_index) then go to even_label, otherwise go to odd_label. We already
1960 // know that the character is in the range of min_char to max_char inclusive.
1961 // Either label can be NULL indicating backtracking. Either label can also be
1962 // equal to the fall_through label.
1964  ZoneList<int>* ranges,
1965  int start_index,
1966  int end_index,
1967  uc16 min_char,
1968  uc16 max_char,
1969  Label* fall_through,
1970  Label* even_label,
1971  Label* odd_label) {
1972  int first = ranges->at(start_index);
1973  int last = ranges->at(end_index) - 1;
1974 
1975  DCHECK_LT(min_char, first);
1976 
1977  // Just need to test if the character is before or on-or-after
1978  // a particular character.
1979  if (start_index == end_index) {
1980  EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1981  return;
1982  }
1983 
1984  // Another almost trivial case: There is one interval in the middle that is
1985  // different from the end intervals.
1986  if (start_index + 1 == end_index) {
1988  masm, first, last, fall_through, even_label, odd_label);
1989  return;
1990  }
1991 
1992  // It's not worth using table lookup if there are very few intervals in the
1993  // character class.
1994  if (end_index - start_index <= 6) {
1995  // It is faster to test for individual characters, so we look for those
1996  // first, then try arbitrary ranges in the second round.
1997  static int kNoCutIndex = -1;
1998  int cut = kNoCutIndex;
1999  for (int i = start_index; i < end_index; i++) {
2000  if (ranges->at(i) == ranges->at(i + 1) - 1) {
2001  cut = i;
2002  break;
2003  }
2004  }
2005  if (cut == kNoCutIndex) cut = start_index;
2006  CutOutRange(
2007  masm, ranges, start_index, end_index, cut, even_label, odd_label);
2008  DCHECK_GE(end_index - start_index, 2);
2009  GenerateBranches(masm,
2010  ranges,
2011  start_index + 1,
2012  end_index - 1,
2013  min_char,
2014  max_char,
2015  fall_through,
2016  even_label,
2017  odd_label);
2018  return;
2019  }
2020 
2021  // If there are a lot of intervals in the regexp, then we will use tables to
2022  // determine whether the character is inside or outside the character class.
2023  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2024 
2025  if ((max_char >> kBits) == (min_char >> kBits)) {
2026  EmitUseLookupTable(masm,
2027  ranges,
2028  start_index,
2029  end_index,
2030  min_char,
2031  fall_through,
2032  even_label,
2033  odd_label);
2034  return;
2035  }
2036 
2037  if ((min_char >> kBits) != (first >> kBits)) {
2038  masm->CheckCharacterLT(first, odd_label);
2039  GenerateBranches(masm,
2040  ranges,
2041  start_index + 1,
2042  end_index,
2043  first,
2044  max_char,
2045  fall_through,
2046  odd_label,
2047  even_label);
2048  return;
2049  }
2050 
2051  int new_start_index = 0;
2052  int new_end_index = 0;
2053  int border = 0;
2054 
2055  SplitSearchSpace(ranges,
2056  start_index,
2057  end_index,
2058  &new_start_index,
2059  &new_end_index,
2060  &border);
2061 
2062  Label handle_rest;
2063  Label* above = &handle_rest;
2064  if (border == last + 1) {
2065  // We didn't find any section that started after the limit, so everything
2066  // above the border is one of the terminal labels.
2067  above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2068  DCHECK(new_end_index == end_index - 1);
2069  }
2070 
2071  DCHECK_LE(start_index, new_end_index);
2072  DCHECK_LE(new_start_index, end_index);
2073  DCHECK_LT(start_index, new_start_index);
2074  DCHECK_LT(new_end_index, end_index);
2075  DCHECK(new_end_index + 1 == new_start_index ||
2076  (new_end_index + 2 == new_start_index &&
2077  border == ranges->at(new_end_index + 1)));
2078  DCHECK_LT(min_char, border - 1);
2079  DCHECK_LT(border, max_char);
2080  DCHECK_LT(ranges->at(new_end_index), border);
2081  DCHECK(border < ranges->at(new_start_index) ||
2082  (border == ranges->at(new_start_index) &&
2083  new_start_index == end_index &&
2084  new_end_index == end_index - 1 &&
2085  border == last + 1));
2086  DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2087 
2088  masm->CheckCharacterGT(border - 1, above);
2089  Label dummy;
2090  GenerateBranches(masm,
2091  ranges,
2092  start_index,
2093  new_end_index,
2094  min_char,
2095  border - 1,
2096  &dummy,
2097  even_label,
2098  odd_label);
2099  if (handle_rest.is_linked()) {
2100  masm->Bind(&handle_rest);
2101  bool flip = (new_start_index & 1) != (start_index & 1);
2102  GenerateBranches(masm,
2103  ranges,
2104  new_start_index,
2105  end_index,
2106  border,
2107  max_char,
2108  &dummy,
2109  flip ? odd_label : even_label,
2110  flip ? even_label : odd_label);
2111  }
2112 }
2113 
2114 
2115 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2116  RegExpCharacterClass* cc, bool one_byte,
2117  Label* on_failure, int cp_offset, bool check_offset,
2118  bool preloaded, Zone* zone) {
2119  ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2120  if (!CharacterRange::IsCanonical(ranges)) {
2122  }
2123 
2124  int max_char;
2125  if (one_byte) {
2126  max_char = String::kMaxOneByteCharCode;
2127  } else {
2128  max_char = String::kMaxUtf16CodeUnit;
2129  }
2130 
2131  int range_count = ranges->length();
2132 
2133  int last_valid_range = range_count - 1;
2134  while (last_valid_range >= 0) {
2135  CharacterRange& range = ranges->at(last_valid_range);
2136  if (range.from() <= max_char) {
2137  break;
2138  }
2139  last_valid_range--;
2140  }
2141 
2142  if (last_valid_range < 0) {
2143  if (!cc->is_negated()) {
2144  macro_assembler->GoTo(on_failure);
2145  }
2146  if (check_offset) {
2147  macro_assembler->CheckPosition(cp_offset, on_failure);
2148  }
2149  return;
2150  }
2151 
2152  if (last_valid_range == 0 &&
2153  ranges->at(0).IsEverything(max_char)) {
2154  if (cc->is_negated()) {
2155  macro_assembler->GoTo(on_failure);
2156  } else {
2157  // This is a common case hit by non-anchored expressions.
2158  if (check_offset) {
2159  macro_assembler->CheckPosition(cp_offset, on_failure);
2160  }
2161  }
2162  return;
2163  }
2164  if (last_valid_range == 0 &&
2165  !cc->is_negated() &&
2166  ranges->at(0).IsEverything(max_char)) {
2167  // This is a common case hit by non-anchored expressions.
2168  if (check_offset) {
2169  macro_assembler->CheckPosition(cp_offset, on_failure);
2170  }
2171  return;
2172  }
2173 
2174  if (!preloaded) {
2175  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2176  }
2177 
2178  if (cc->is_standard(zone) &&
2179  macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2180  on_failure)) {
2181  return;
2182  }
2183 
2184 
2185  // A new list with ascending entries. Each entry is a code unit
2186  // where there is a boundary between code units that are part of
2187  // the class and code units that are not. Normally we insert an
2188  // entry at zero which goes to the failure label, but if there
2189  // was already one there we fall through for success on that entry.
2190  // Subsequent entries have alternating meaning (success/failure).
2191  ZoneList<int>* range_boundaries =
2192  new(zone) ZoneList<int>(last_valid_range, zone);
2193 
2194  bool zeroth_entry_is_failure = !cc->is_negated();
2195 
2196  for (int i = 0; i <= last_valid_range; i++) {
2197  CharacterRange& range = ranges->at(i);
2198  if (range.from() == 0) {
2199  DCHECK_EQ(i, 0);
2200  zeroth_entry_is_failure = !zeroth_entry_is_failure;
2201  } else {
2202  range_boundaries->Add(range.from(), zone);
2203  }
2204  range_boundaries->Add(range.to() + 1, zone);
2205  }
2206  int end_index = range_boundaries->length() - 1;
2207  if (range_boundaries->at(end_index) > max_char) {
2208  end_index--;
2209  }
2210 
2211  Label fall_through;
2212  GenerateBranches(macro_assembler,
2213  range_boundaries,
2214  0, // start_index.
2215  end_index,
2216  0, // min_char.
2217  max_char,
2218  &fall_through,
2219  zeroth_entry_is_failure ? &fall_through : on_failure,
2220  zeroth_entry_is_failure ? on_failure : &fall_through);
2221  macro_assembler->Bind(&fall_through);
2222 }
2223 
2224 
2226 }
2227 
2228 
2230  Trace* trace) {
2231  // If we are generating a greedy loop then don't stop and don't reuse code.
2232  if (trace->stop_node() != NULL) {
2233  return CONTINUE;
2234  }
2235 
2236  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2237  if (trace->is_trivial()) {
2238  if (label_.is_bound()) {
2239  // We are being asked to generate a generic version, but that's already
2240  // been done so just go to it.
2241  macro_assembler->GoTo(&label_);
2242  return DONE;
2243  }
2244  if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2245  // To avoid too deep recursion we push the node to the work queue and just
2246  // generate a goto here.
2247  compiler->AddWork(this);
2248  macro_assembler->GoTo(&label_);
2249  return DONE;
2250  }
2251  // Generate generic version of the node and bind the label for later use.
2252  macro_assembler->Bind(&label_);
2253  return CONTINUE;
2254  }
2255 
2256  // We are being asked to make a non-generic version. Keep track of how many
2257  // non-generic versions we generate so as not to overdo it.
2258  trace_count_++;
2259  if (FLAG_regexp_optimization &&
2260  trace_count_ < kMaxCopiesCodeGenerated &&
2262  return CONTINUE;
2263  }
2264 
2265  // If we get here code has been generated for this node too many times or
2266  // recursion is too deep. Time to switch to a generic version. The code for
2267  // generic versions above can handle deep recursion properly.
2268  trace->Flush(compiler, this);
2269  return DONE;
2270 }
2271 
2272 
2273 int ActionNode::EatsAtLeast(int still_to_find,
2274  int budget,
2275  bool not_at_start) {
2276  if (budget <= 0) return 0;
2277  if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2278  return on_success()->EatsAtLeast(still_to_find,
2279  budget - 1,
2280  not_at_start);
2281 }
2282 
2283 
2285  int budget,
2286  BoyerMooreLookahead* bm,
2287  bool not_at_start) {
2288  if (action_type_ == BEGIN_SUBMATCH) {
2289  bm->SetRest(offset);
2290  } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2291  on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2292  }
2293  SaveBMInfo(bm, not_at_start, offset);
2294 }
2295 
2296 
2297 int AssertionNode::EatsAtLeast(int still_to_find,
2298  int budget,
2299  bool not_at_start) {
2300  if (budget <= 0) return 0;
2301  // If we know we are not at the start and we are asked "how many characters
2302  // will you match if you succeed?" then we can answer anything since false
2303  // implies false. So lets just return the max answer (still_to_find) since
2304  // that won't prevent us from preloading a lot of characters for the other
2305  // branches in the node graph.
2306  if (assertion_type() == AT_START && not_at_start) return still_to_find;
2307  return on_success()->EatsAtLeast(still_to_find,
2308  budget - 1,
2309  not_at_start);
2310 }
2311 
2312 
2314  int budget,
2315  BoyerMooreLookahead* bm,
2316  bool not_at_start) {
2317  // Match the behaviour of EatsAtLeast on this node.
2318  if (assertion_type() == AT_START && not_at_start) return;
2319  on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2320  SaveBMInfo(bm, not_at_start, offset);
2321 }
2322 
2323 
2324 int BackReferenceNode::EatsAtLeast(int still_to_find,
2325  int budget,
2326  bool not_at_start) {
2327  if (budget <= 0) return 0;
2328  return on_success()->EatsAtLeast(still_to_find,
2329  budget - 1,
2330  not_at_start);
2331 }
2332 
2333 
2334 int TextNode::EatsAtLeast(int still_to_find,
2335  int budget,
2336  bool not_at_start) {
2337  int answer = Length();
2338  if (answer >= still_to_find) return answer;
2339  if (budget <= 0) return answer;
2340  // We are not at start after this node so we set the last argument to 'true'.
2341  return answer + on_success()->EatsAtLeast(still_to_find - answer,
2342  budget - 1,
2343  true);
2344 }
2345 
2346 
2348  int budget,
2349  bool not_at_start) {
2350  if (budget <= 0) return 0;
2351  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2352  // afterwards.
2353  RegExpNode* node = alternatives_->at(1).node();
2354  return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2355 }
2356 
2357 
2359  QuickCheckDetails* details,
2360  RegExpCompiler* compiler,
2361  int filled_in,
2362  bool not_at_start) {
2363  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2364  // afterwards.
2365  RegExpNode* node = alternatives_->at(1).node();
2366  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2367 }
2368 
2369 
2370 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2371  int budget,
2372  RegExpNode* ignore_this_node,
2373  bool not_at_start) {
2374  if (budget <= 0) return 0;
2375  int min = 100;
2376  int choice_count = alternatives_->length();
2377  budget = (budget - 1) / choice_count;
2378  for (int i = 0; i < choice_count; i++) {
2379  RegExpNode* node = alternatives_->at(i).node();
2380  if (node == ignore_this_node) continue;
2381  int node_eats_at_least =
2382  node->EatsAtLeast(still_to_find, budget, not_at_start);
2383  if (node_eats_at_least < min) min = node_eats_at_least;
2384  if (min == 0) return 0;
2385  }
2386  return min;
2387 }
2388 
2389 
2390 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2391  int budget,
2392  bool not_at_start) {
2393  return EatsAtLeastHelper(still_to_find,
2394  budget - 1,
2395  loop_node_,
2396  not_at_start);
2397 }
2398 
2399 
2400 int ChoiceNode::EatsAtLeast(int still_to_find,
2401  int budget,
2402  bool not_at_start) {
2403  return EatsAtLeastHelper(still_to_find,
2404  budget,
2405  NULL,
2406  not_at_start);
2407 }
2408 
2409 
2410 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
2412  v |= v >> 1;
2413  v |= v >> 2;
2414  v |= v >> 4;
2415  v |= v >> 8;
2416  v |= v >> 16;
2417  return v;
2418 }
2419 
2420 
2422  bool found_useful_op = false;
2423  uint32_t char_mask;
2424  if (asc) {
2425  char_mask = String::kMaxOneByteCharCode;
2426  } else {
2427  char_mask = String::kMaxUtf16CodeUnit;
2428  }
2429  mask_ = 0;
2430  value_ = 0;
2431  int char_shift = 0;
2432  for (int i = 0; i < characters_; i++) {
2433  Position* pos = &positions_[i];
2434  if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2435  found_useful_op = true;
2436  }
2437  mask_ |= (pos->mask & char_mask) << char_shift;
2438  value_ |= (pos->value & char_mask) << char_shift;
2439  char_shift += asc ? 8 : 16;
2440  }
2441  return found_useful_op;
2442 }
2443 
2444 
2446  Trace* bounds_check_trace,
2447  Trace* trace,
2448  bool preload_has_checked_bounds,
2449  Label* on_possible_success,
2450  QuickCheckDetails* details,
2451  bool fall_through_on_failure) {
2452  if (details->characters() == 0) return false;
2453  GetQuickCheckDetails(
2454  details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2455  if (details->cannot_match()) return false;
2456  if (!details->Rationalize(compiler->one_byte())) return false;
2457  DCHECK(details->characters() == 1 ||
2458  compiler->macro_assembler()->CanReadUnaligned());
2459  uint32_t mask = details->mask();
2460  uint32_t value = details->value();
2461 
2462  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2463 
2464  if (trace->characters_preloaded() != details->characters()) {
2465  DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2466  // We are attempting to preload the minimum number of characters
2467  // any choice would eat, so if the bounds check fails, then none of the
2468  // choices can succeed, so we can just immediately backtrack, rather
2469  // than go to the next choice.
2470  assembler->LoadCurrentCharacter(trace->cp_offset(),
2471  bounds_check_trace->backtrack(),
2472  !preload_has_checked_bounds,
2473  details->characters());
2474  }
2475 
2476 
2477  bool need_mask = true;
2478 
2479  if (details->characters() == 1) {
2480  // If number of characters preloaded is 1 then we used a byte or 16 bit
2481  // load so the value is already masked down.
2482  uint32_t char_mask;
2483  if (compiler->one_byte()) {
2484  char_mask = String::kMaxOneByteCharCode;
2485  } else {
2486  char_mask = String::kMaxUtf16CodeUnit;
2487  }
2488  if ((mask & char_mask) == char_mask) need_mask = false;
2489  mask &= char_mask;
2490  } else {
2491  // For 2-character preloads in one-byte mode or 1-character preloads in
2492  // two-byte mode we also use a 16 bit load with zero extend.
2493  if (details->characters() == 2 && compiler->one_byte()) {
2494  if ((mask & 0xffff) == 0xffff) need_mask = false;
2495  } else if (details->characters() == 1 && !compiler->one_byte()) {
2496  if ((mask & 0xffff) == 0xffff) need_mask = false;
2497  } else {
2498  if (mask == 0xffffffff) need_mask = false;
2499  }
2500  }
2501 
2502  if (fall_through_on_failure) {
2503  if (need_mask) {
2504  assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2505  } else {
2506  assembler->CheckCharacter(value, on_possible_success);
2507  }
2508  } else {
2509  if (need_mask) {
2510  assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2511  } else {
2512  assembler->CheckNotCharacter(value, trace->backtrack());
2513  }
2514  }
2515  return true;
2516 }
2517 
2518 
2519 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2520 // super-class in the .h file).
2521 //
2522 // We iterate along the text object, building up for each character a
2523 // mask and value that can be used to test for a quick failure to match.
2524 // The masks and values for the positions will be combined into a single
2525 // machine word for the current character width in order to be used in
2526 // generating a quick check.
2528  RegExpCompiler* compiler,
2529  int characters_filled_in,
2530  bool not_at_start) {
2531  Isolate* isolate = compiler->macro_assembler()->zone()->isolate();
2532  DCHECK(characters_filled_in < details->characters());
2533  int characters = details->characters();
2534  int char_mask;
2535  if (compiler->one_byte()) {
2536  char_mask = String::kMaxOneByteCharCode;
2537  } else {
2538  char_mask = String::kMaxUtf16CodeUnit;
2539  }
2540  for (int k = 0; k < elms_->length(); k++) {
2541  TextElement elm = elms_->at(k);
2542  if (elm.text_type() == TextElement::ATOM) {
2543  Vector<const uc16> quarks = elm.atom()->data();
2544  for (int i = 0; i < characters && i < quarks.length(); i++) {
2546  details->positions(characters_filled_in);
2547  uc16 c = quarks[i];
2548  if (c > char_mask) {
2549  // If we expect a non-Latin1 character from an one-byte string,
2550  // there is no way we can match. Not even case-independent
2551  // matching can turn an Latin1 character into non-Latin1 or
2552  // vice versa.
2553  // TODO(dcarney): issue 3550. Verify that this works as expected.
2554  // For example, \u0178 is uppercase of \u00ff (y-umlaut).
2555  details->set_cannot_match();
2556  pos->determines_perfectly = false;
2557  return;
2558  }
2559  if (compiler->ignore_case()) {
2561  int length = GetCaseIndependentLetters(isolate, c,
2562  compiler->one_byte(), chars);
2563  DCHECK(length != 0); // Can only happen if c > char_mask (see above).
2564  if (length == 1) {
2565  // This letter has no case equivalents, so it's nice and simple
2566  // and the mask-compare will determine definitely whether we have
2567  // a match at this character position.
2568  pos->mask = char_mask;
2569  pos->value = c;
2570  pos->determines_perfectly = true;
2571  } else {
2572  uint32_t common_bits = char_mask;
2573  uint32_t bits = chars[0];
2574  for (int j = 1; j < length; j++) {
2575  uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2576  common_bits ^= differing_bits;
2577  bits &= common_bits;
2578  }
2579  // If length is 2 and common bits has only one zero in it then
2580  // our mask and compare instruction will determine definitely
2581  // whether we have a match at this character position. Otherwise
2582  // it can only be an approximate check.
2583  uint32_t one_zero = (common_bits | ~char_mask);
2584  if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2585  pos->determines_perfectly = true;
2586  }
2587  pos->mask = common_bits;
2588  pos->value = bits;
2589  }
2590  } else {
2591  // Don't ignore case. Nice simple case where the mask-compare will
2592  // determine definitely whether we have a match at this character
2593  // position.
2594  pos->mask = char_mask;
2595  pos->value = c;
2596  pos->determines_perfectly = true;
2597  }
2598  characters_filled_in++;
2599  DCHECK(characters_filled_in <= details->characters());
2600  if (characters_filled_in == details->characters()) {
2601  return;
2602  }
2603  }
2604  } else {
2606  details->positions(characters_filled_in);
2607  RegExpCharacterClass* tree = elm.char_class();
2608  ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2609  if (tree->is_negated()) {
2610  // A quick check uses multi-character mask and compare. There is no
2611  // useful way to incorporate a negative char class into this scheme
2612  // so we just conservatively create a mask and value that will always
2613  // succeed.
2614  pos->mask = 0;
2615  pos->value = 0;
2616  } else {
2617  int first_range = 0;
2618  while (ranges->at(first_range).from() > char_mask) {
2619  first_range++;
2620  if (first_range == ranges->length()) {
2621  details->set_cannot_match();
2622  pos->determines_perfectly = false;
2623  return;
2624  }
2625  }
2626  CharacterRange range = ranges->at(first_range);
2627  uc16 from = range.from();
2628  uc16 to = range.to();
2629  if (to > char_mask) {
2630  to = char_mask;
2631  }
2632  uint32_t differing_bits = (from ^ to);
2633  // A mask and compare is only perfect if the differing bits form a
2634  // number like 00011111 with one single block of trailing 1s.
2635  if ((differing_bits & (differing_bits + 1)) == 0 &&
2636  from + differing_bits == to) {
2637  pos->determines_perfectly = true;
2638  }
2639  uint32_t common_bits = ~SmearBitsRight(differing_bits);
2640  uint32_t bits = (from & common_bits);
2641  for (int i = first_range + 1; i < ranges->length(); i++) {
2642  CharacterRange range = ranges->at(i);
2643  uc16 from = range.from();
2644  uc16 to = range.to();
2645  if (from > char_mask) continue;
2646  if (to > char_mask) to = char_mask;
2647  // Here we are combining more ranges into the mask and compare
2648  // value. With each new range the mask becomes more sparse and
2649  // so the chances of a false positive rise. A character class
2650  // with multiple ranges is assumed never to be equivalent to a
2651  // mask and compare operation.
2652  pos->determines_perfectly = false;
2653  uint32_t new_common_bits = (from ^ to);
2654  new_common_bits = ~SmearBitsRight(new_common_bits);
2655  common_bits &= new_common_bits;
2656  bits &= new_common_bits;
2657  uint32_t differing_bits = (from & common_bits) ^ bits;
2658  common_bits ^= differing_bits;
2659  bits &= common_bits;
2660  }
2661  pos->mask = common_bits;
2662  pos->value = bits;
2663  }
2664  characters_filled_in++;
2665  DCHECK(characters_filled_in <= details->characters());
2666  if (characters_filled_in == details->characters()) {
2667  return;
2668  }
2669  }
2670  }
2671  DCHECK(characters_filled_in != details->characters());
2672  if (!details->cannot_match()) {
2673  on_success()-> GetQuickCheckDetails(details,
2674  compiler,
2675  characters_filled_in,
2676  true);
2677  }
2678 }
2679 
2680 
2682  for (int i = 0; i < characters_; i++) {
2683  positions_[i].mask = 0;
2684  positions_[i].value = 0;
2685  positions_[i].determines_perfectly = false;
2686  }
2687  characters_ = 0;
2688 }
2689 
2690 
2691 void QuickCheckDetails::Advance(int by, bool one_byte) {
2692  DCHECK(by >= 0);
2693  if (by >= characters_) {
2694  Clear();
2695  return;
2696  }
2697  for (int i = 0; i < characters_ - by; i++) {
2698  positions_[i] = positions_[by + i];
2699  }
2700  for (int i = characters_ - by; i < characters_; i++) {
2701  positions_[i].mask = 0;
2702  positions_[i].value = 0;
2703  positions_[i].determines_perfectly = false;
2704  }
2705  characters_ -= by;
2706  // We could change mask_ and value_ here but we would never advance unless
2707  // they had already been used in a check and they won't be used again because
2708  // it would gain us nothing. So there's no point.
2709 }
2710 
2711 
2712 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2713  DCHECK(characters_ == other->characters_);
2714  if (other->cannot_match_) {
2715  return;
2716  }
2717  if (cannot_match_) {
2718  *this = *other;
2719  return;
2720  }
2721  for (int i = from_index; i < characters_; i++) {
2722  QuickCheckDetails::Position* pos = positions(i);
2723  QuickCheckDetails::Position* other_pos = other->positions(i);
2724  if (pos->mask != other_pos->mask ||
2725  pos->value != other_pos->value ||
2726  !other_pos->determines_perfectly) {
2727  // Our mask-compare operation will be approximate unless we have the
2728  // exact same operation on both sides of the alternation.
2729  pos->determines_perfectly = false;
2730  }
2731  pos->mask &= other_pos->mask;
2732  pos->value &= pos->mask;
2733  other_pos->value &= pos->mask;
2734  uc16 differing_bits = (pos->value ^ other_pos->value);
2735  pos->mask &= ~differing_bits;
2736  pos->value &= pos->mask;
2737  }
2738 }
2739 
2740 
2742  public:
2743  explicit VisitMarker(NodeInfo* info) : info_(info) {
2744  DCHECK(!info->visited);
2745  info->visited = true;
2746  }
2748  info_->visited = false;
2749  }
2750  private:
2752 };
2753 
2754 
2755 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2756  if (info()->replacement_calculated) return replacement();
2757  if (depth < 0) return this;
2758  DCHECK(!info()->visited);
2759  VisitMarker marker(info());
2760  return FilterSuccessor(depth - 1, ignore_case);
2761 }
2762 
2763 
2764 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2765  RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2766  if (next == NULL) return set_replacement(NULL);
2767  on_success_ = next;
2768  return set_replacement(this);
2769 }
2770 
2771 
2772 // We need to check for the following characters: 0x39c 0x3bc 0x178.
2774  // TODO(dcarney): this could be a lot more efficient.
2775  return range.Contains(0x39c) ||
2776  range.Contains(0x3bc) || range.Contains(0x178);
2777 }
2778 
2779 
2781  for (int i = 0; i < ranges->length(); i++) {
2782  // TODO(dcarney): this could be a lot more efficient.
2783  if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2784  }
2785  return false;
2786 }
2787 
2788 
2789 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2790  if (info()->replacement_calculated) return replacement();
2791  if (depth < 0) return this;
2792  DCHECK(!info()->visited);
2793  VisitMarker marker(info());
2794  int element_count = elms_->length();
2795  for (int i = 0; i < element_count; i++) {
2796  TextElement elm = elms_->at(i);
2797  if (elm.text_type() == TextElement::ATOM) {
2798  Vector<const uc16> quarks = elm.atom()->data();
2799  for (int j = 0; j < quarks.length(); j++) {
2800  uint16_t c = quarks[j];
2801  if (c <= String::kMaxOneByteCharCode) continue;
2802  if (!ignore_case) return set_replacement(NULL);
2803  // Here, we need to check for characters whose upper and lower cases
2804  // are outside the Latin-1 range.
2806  // Character is outside Latin-1 completely
2807  if (converted == 0) return set_replacement(NULL);
2808  // Convert quark to Latin-1 in place.
2809  uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2810  copy[j] = converted;
2811  }
2812  } else {
2813  DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2814  RegExpCharacterClass* cc = elm.char_class();
2815  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2816  if (!CharacterRange::IsCanonical(ranges)) {
2818  }
2819  // Now they are in order so we only need to look at the first.
2820  int range_count = ranges->length();
2821  if (cc->is_negated()) {
2822  if (range_count != 0 &&
2823  ranges->at(0).from() == 0 &&
2824  ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2825  // This will be handled in a later filter.
2826  if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2827  return set_replacement(NULL);
2828  }
2829  } else {
2830  if (range_count == 0 ||
2831  ranges->at(0).from() > String::kMaxOneByteCharCode) {
2832  // This will be handled in a later filter.
2833  if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2834  return set_replacement(NULL);
2835  }
2836  }
2837  }
2838  }
2839  return FilterSuccessor(depth - 1, ignore_case);
2840 }
2841 
2842 
2843 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2844  if (info()->replacement_calculated) return replacement();
2845  if (depth < 0) return this;
2846  if (info()->visited) return this;
2847  {
2848  VisitMarker marker(info());
2849 
2850  RegExpNode* continue_replacement =
2851  continue_node_->FilterOneByte(depth - 1, ignore_case);
2852  // If we can't continue after the loop then there is no sense in doing the
2853  // loop.
2854  if (continue_replacement == NULL) return set_replacement(NULL);
2855  }
2856 
2857  return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2858 }
2859 
2860 
2861 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2862  if (info()->replacement_calculated) return replacement();
2863  if (depth < 0) return this;
2864  if (info()->visited) return this;
2865  VisitMarker marker(info());
2866  int choice_count = alternatives_->length();
2867 
2868  for (int i = 0; i < choice_count; i++) {
2869  GuardedAlternative alternative = alternatives_->at(i);
2870  if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2871  set_replacement(this);
2872  return this;
2873  }
2874  }
2875 
2876  int surviving = 0;
2877  RegExpNode* survivor = NULL;
2878  for (int i = 0; i < choice_count; i++) {
2879  GuardedAlternative alternative = alternatives_->at(i);
2880  RegExpNode* replacement =
2881  alternative.node()->FilterOneByte(depth - 1, ignore_case);
2882  DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2883  if (replacement != NULL) {
2884  alternatives_->at(i).set_node(replacement);
2885  surviving++;
2886  survivor = replacement;
2887  }
2888  }
2889  if (surviving < 2) return set_replacement(survivor);
2890 
2891  set_replacement(this);
2892  if (surviving == choice_count) {
2893  return this;
2894  }
2895  // Only some of the nodes survived the filtering. We need to rebuild the
2896  // alternatives list.
2897  ZoneList<GuardedAlternative>* new_alternatives =
2898  new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2899  for (int i = 0; i < choice_count; i++) {
2900  RegExpNode* replacement =
2901  alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2902  if (replacement != NULL) {
2903  alternatives_->at(i).set_node(replacement);
2904  new_alternatives->Add(alternatives_->at(i), zone());
2905  }
2906  }
2907  alternatives_ = new_alternatives;
2908  return this;
2909 }
2910 
2911 
2913  bool ignore_case) {
2914  if (info()->replacement_calculated) return replacement();
2915  if (depth < 0) return this;
2916  if (info()->visited) return this;
2917  VisitMarker marker(info());
2918  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2919  // afterwards.
2920  RegExpNode* node = alternatives_->at(1).node();
2921  RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2922  if (replacement == NULL) return set_replacement(NULL);
2923  alternatives_->at(1).set_node(replacement);
2924 
2925  RegExpNode* neg_node = alternatives_->at(0).node();
2926  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2927  // If the negative lookahead is always going to fail then
2928  // we don't need to check it.
2929  if (neg_replacement == NULL) return set_replacement(replacement);
2930  alternatives_->at(0).set_node(neg_replacement);
2931  return set_replacement(this);
2932 }
2933 
2934 
2936  RegExpCompiler* compiler,
2937  int characters_filled_in,
2938  bool not_at_start) {
2939  if (body_can_be_zero_length_ || info()->visited) return;
2940  VisitMarker marker(info());
2941  return ChoiceNode::GetQuickCheckDetails(details,
2942  compiler,
2943  characters_filled_in,
2944  not_at_start);
2945 }
2946 
2947 
2949  int budget,
2950  BoyerMooreLookahead* bm,
2951  bool not_at_start) {
2952  if (body_can_be_zero_length_ || budget <= 0) {
2953  bm->SetRest(offset);
2954  SaveBMInfo(bm, not_at_start, offset);
2955  return;
2956  }
2957  ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2958  SaveBMInfo(bm, not_at_start, offset);
2959 }
2960 
2961 
2963  RegExpCompiler* compiler,
2964  int characters_filled_in,
2965  bool not_at_start) {
2966  not_at_start = (not_at_start || not_at_start_);
2967  int choice_count = alternatives_->length();
2968  DCHECK(choice_count > 0);
2969  alternatives_->at(0).node()->GetQuickCheckDetails(details,
2970  compiler,
2971  characters_filled_in,
2972  not_at_start);
2973  for (int i = 1; i < choice_count; i++) {
2974  QuickCheckDetails new_details(details->characters());
2975  RegExpNode* node = alternatives_->at(i).node();
2976  node->GetQuickCheckDetails(&new_details, compiler,
2977  characters_filled_in,
2978  not_at_start);
2979  // Here we merge the quick match details of the two branches.
2980  details->Merge(&new_details, characters_filled_in);
2981  }
2982 }
2983 
2984 
2985 // Check for [0-9A-Z_a-z].
2986 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2987  Label* word,
2988  Label* non_word,
2989  bool fall_through_on_word) {
2990  if (assembler->CheckSpecialCharacterClass(
2991  fall_through_on_word ? 'w' : 'W',
2992  fall_through_on_word ? non_word : word)) {
2993  // Optimized implementation available.
2994  return;
2995  }
2996  assembler->CheckCharacterGT('z', non_word);
2997  assembler->CheckCharacterLT('0', non_word);
2998  assembler->CheckCharacterGT('a' - 1, word);
2999  assembler->CheckCharacterLT('9' + 1, word);
3000  assembler->CheckCharacterLT('A', non_word);
3001  assembler->CheckCharacterLT('Z' + 1, word);
3002  if (fall_through_on_word) {
3003  assembler->CheckNotCharacter('_', non_word);
3004  } else {
3005  assembler->CheckCharacter('_', word);
3006  }
3007 }
3008 
3009 
3010 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3011 // that matches newline or the start of input).
3012 static void EmitHat(RegExpCompiler* compiler,
3013  RegExpNode* on_success,
3014  Trace* trace) {
3015  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3016  // We will be loading the previous character into the current character
3017  // register.
3018  Trace new_trace(*trace);
3019  new_trace.InvalidateCurrentCharacter();
3020 
3021  Label ok;
3022  if (new_trace.cp_offset() == 0) {
3023  // The start of input counts as a newline in this context, so skip to
3024  // ok if we are at the start.
3025  assembler->CheckAtStart(&ok);
3026  }
3027  // We already checked that we are not at the start of input so it must be
3028  // OK to load the previous character.
3029  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3030  new_trace.backtrack(),
3031  false);
3032  if (!assembler->CheckSpecialCharacterClass('n',
3033  new_trace.backtrack())) {
3034  // Newline means \n, \r, 0x2028 or 0x2029.
3035  if (!compiler->one_byte()) {
3036  assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3037  }
3038  assembler->CheckCharacter('\n', &ok);
3039  assembler->CheckNotCharacter('\r', new_trace.backtrack());
3040  }
3041  assembler->Bind(&ok);
3042  on_success->Emit(compiler, &new_trace);
3043 }
3044 
3045 
3046 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3048  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3049  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3050  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3051  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3052  if (lookahead == NULL) {
3053  int eats_at_least =
3055  kRecursionBudget,
3056  not_at_start));
3057  if (eats_at_least >= 1) {
3058  BoyerMooreLookahead* bm =
3059  new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3060  FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3061  if (bm->at(0)->is_non_word())
3062  next_is_word_character = Trace::FALSE_VALUE;
3063  if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3064  }
3065  } else {
3066  if (lookahead->at(0)->is_non_word())
3067  next_is_word_character = Trace::FALSE_VALUE;
3068  if (lookahead->at(0)->is_word())
3069  next_is_word_character = Trace::TRUE_VALUE;
3070  }
3071  bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3072  if (next_is_word_character == Trace::UNKNOWN) {
3073  Label before_non_word;
3074  Label before_word;
3075  if (trace->characters_preloaded() != 1) {
3076  assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3077  }
3078  // Fall through on non-word.
3079  EmitWordCheck(assembler, &before_word, &before_non_word, false);
3080  // Next character is not a word character.
3081  assembler->Bind(&before_non_word);
3082  Label ok;
3083  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3084  assembler->GoTo(&ok);
3085 
3086  assembler->Bind(&before_word);
3087  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3088  assembler->Bind(&ok);
3089  } else if (next_is_word_character == Trace::TRUE_VALUE) {
3090  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3091  } else {
3092  DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3093  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3094  }
3095 }
3096 
3097 
3099  RegExpCompiler* compiler,
3100  Trace* trace,
3101  AssertionNode::IfPrevious backtrack_if_previous) {
3102  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3103  Trace new_trace(*trace);
3104  new_trace.InvalidateCurrentCharacter();
3105 
3106  Label fall_through, dummy;
3107 
3108  Label* non_word = backtrack_if_previous == kIsNonWord ?
3109  new_trace.backtrack() :
3110  &fall_through;
3111  Label* word = backtrack_if_previous == kIsNonWord ?
3112  &fall_through :
3113  new_trace.backtrack();
3114 
3115  if (new_trace.cp_offset() == 0) {
3116  // The start of input counts as a non-word character, so the question is
3117  // decided if we are at the start.
3118  assembler->CheckAtStart(non_word);
3119  }
3120  // We already checked that we are not at the start of input so it must be
3121  // OK to load the previous character.
3122  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3123  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3124 
3125  assembler->Bind(&fall_through);
3126  on_success()->Emit(compiler, &new_trace);
3127 }
3128 
3129 
3131  RegExpCompiler* compiler,
3132  int filled_in,
3133  bool not_at_start) {
3134  if (assertion_type_ == AT_START && not_at_start) {
3135  details->set_cannot_match();
3136  return;
3137  }
3138  return on_success()->GetQuickCheckDetails(details,
3139  compiler,
3140  filled_in,
3141  not_at_start);
3142 }
3143 
3144 
3145 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3146  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3147  switch (assertion_type_) {
3148  case AT_END: {
3149  Label ok;
3150  assembler->CheckPosition(trace->cp_offset(), &ok);
3151  assembler->GoTo(trace->backtrack());
3152  assembler->Bind(&ok);
3153  break;
3154  }
3155  case AT_START: {
3156  if (trace->at_start() == Trace::FALSE_VALUE) {
3157  assembler->GoTo(trace->backtrack());
3158  return;
3159  }
3160  if (trace->at_start() == Trace::UNKNOWN) {
3161  assembler->CheckNotAtStart(trace->backtrack());
3162  Trace at_start_trace = *trace;
3163  at_start_trace.set_at_start(true);
3164  on_success()->Emit(compiler, &at_start_trace);
3165  return;
3166  }
3167  }
3168  break;
3169  case AFTER_NEWLINE:
3170  EmitHat(compiler, on_success(), trace);
3171  return;
3172  case AT_BOUNDARY:
3173  case AT_NON_BOUNDARY: {
3174  EmitBoundaryCheck(compiler, trace);
3175  return;
3176  }
3177  }
3178  on_success()->Emit(compiler, trace);
3179 }
3180 
3181 
3182 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3183  if (quick_check == NULL) return false;
3184  if (offset >= quick_check->characters()) return false;
3185  return quick_check->positions(offset)->determines_perfectly;
3186 }
3187 
3188 
3189 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3190  if (index > *checked_up_to) {
3191  *checked_up_to = index;
3192  }
3193 }
3194 
3195 
3196 // We call this repeatedly to generate code for each pass over the text node.
3197 // The passes are in increasing order of difficulty because we hope one
3198 // of the first passes will fail in which case we are saved the work of the
3199 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3200 // we will check the '%' in the first pass, the case independent 'a' in the
3201 // second pass and the character class in the last pass.
3202 //
3203 // The passes are done from right to left, so for example to test for /bar/
3204 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3205 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3206 // bounds check most of the time. In the example we only need to check for
3207 // end-of-input when loading the putative 'r'.
3208 //
3209 // A slight complication involves the fact that the first character may already
3210 // be fetched into a register by the previous node. In this case we want to
3211 // do the test for that character first. We do this in separate passes. The
3212 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3213 // pass has been performed then subsequent passes will have true in
3214 // first_element_checked to indicate that that character does not need to be
3215 // checked again.
3216 //
3217 // In addition to all this we are passed a Trace, which can
3218 // contain an AlternativeGeneration object. In this AlternativeGeneration
3219 // object we can see details of any quick check that was already passed in
3220 // order to get to the code we are now generating. The quick check can involve
3221 // loading characters, which means we do not need to recheck the bounds
3222 // up to the limit the quick check already checked. In addition the quick
3223 // check can have involved a mask and compare operation which may simplify
3224 // or obviate the need for further checks at some character positions.
3226  TextEmitPassType pass,
3227  bool preloaded,
3228  Trace* trace,
3229  bool first_element_checked,
3230  int* checked_up_to) {
3231  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3232  Isolate* isolate = assembler->zone()->isolate();
3233  bool one_byte = compiler->one_byte();
3234  Label* backtrack = trace->backtrack();
3235  QuickCheckDetails* quick_check = trace->quick_check_performed();
3236  int element_count = elms_->length();
3237  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3238  TextElement elm = elms_->at(i);
3239  int cp_offset = trace->cp_offset() + elm.cp_offset();
3240  if (elm.text_type() == TextElement::ATOM) {
3241  Vector<const uc16> quarks = elm.atom()->data();
3242  for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3243  if (first_element_checked && i == 0 && j == 0) continue;
3244  if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3245  EmitCharacterFunction* emit_function = NULL;
3246  switch (pass) {
3247  case NON_LATIN1_MATCH:
3248  DCHECK(one_byte);
3249  if (quarks[j] > String::kMaxOneByteCharCode) {
3250  assembler->GoTo(backtrack);
3251  return;
3252  }
3253  break;
3254  case NON_LETTER_CHARACTER_MATCH:
3255  emit_function = &EmitAtomNonLetter;
3256  break;
3257  case SIMPLE_CHARACTER_MATCH:
3258  emit_function = &EmitSimpleCharacter;
3259  break;
3260  case CASE_CHARACTER_MATCH:
3261  emit_function = &EmitAtomLetter;
3262  break;
3263  default:
3264  break;
3265  }
3266  if (emit_function != NULL) {
3267  bool bound_checked = emit_function(isolate,
3268  compiler,
3269  quarks[j],
3270  backtrack,
3271  cp_offset + j,
3272  *checked_up_to < cp_offset + j,
3273  preloaded);
3274  if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3275  }
3276  }
3277  } else {
3278  DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3279  if (pass == CHARACTER_CLASS_MATCH) {
3280  if (first_element_checked && i == 0) continue;
3281  if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3282  RegExpCharacterClass* cc = elm.char_class();
3283  EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3284  *checked_up_to < cp_offset, preloaded, zone());
3285  UpdateBoundsCheck(cp_offset, checked_up_to);
3286  }
3287  }
3288  }
3289 }
3290 
3291 
3293  TextElement elm = elms_->last();
3294  DCHECK(elm.cp_offset() >= 0);
3295  return elm.cp_offset() + elm.length();
3296 }
3297 
3298 
3299 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3300  TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3301  if (ignore_case) {
3302  return pass == SIMPLE_CHARACTER_MATCH;
3303  } else {
3304  return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3305  }
3306 }
3307 
3308 
3309 // This generates the code to match a text node. A text node can contain
3310 // straight character sequences (possibly to be matched in a case-independent
3311 // way) and character classes. For efficiency we do not do this in a single
3312 // pass from left to right. Instead we pass over the text node several times,
3313 // emitting code for some character positions every time. See the comment on
3314 // TextEmitPass for details.
3315 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3316  LimitResult limit_result = LimitVersions(compiler, trace);
3317  if (limit_result == DONE) return;
3318  DCHECK(limit_result == CONTINUE);
3319 
3320  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3321  compiler->SetRegExpTooBig();
3322  return;
3323  }
3324 
3325  if (compiler->one_byte()) {
3326  int dummy = 0;
3327  TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3328  }
3329 
3330  bool first_elt_done = false;
3331  int bound_checked_to = trace->cp_offset() - 1;
3332  bound_checked_to += trace->bound_checked_up_to();
3333 
3334  // If a character is preloaded into the current character register then
3335  // check that now.
3336  if (trace->characters_preloaded() == 1) {
3337  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3338  if (!SkipPass(pass, compiler->ignore_case())) {
3339  TextEmitPass(compiler,
3340  static_cast<TextEmitPassType>(pass),
3341  true,
3342  trace,
3343  false,
3344  &bound_checked_to);
3345  }
3346  }
3347  first_elt_done = true;
3348  }
3349 
3350  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3351  if (!SkipPass(pass, compiler->ignore_case())) {
3352  TextEmitPass(compiler,
3353  static_cast<TextEmitPassType>(pass),
3354  false,
3355  trace,
3356  first_elt_done,
3357  &bound_checked_to);
3358  }
3359  }
3360 
3361  Trace successor_trace(*trace);
3362  successor_trace.set_at_start(false);
3363  successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3364  RecursionCheck rc(compiler);
3365  on_success()->Emit(compiler, &successor_trace);
3366 }
3367 
3368 
3371 }
3372 
3373 
3375  DCHECK(by > 0);
3376  // We don't have an instruction for shifting the current character register
3377  // down or for using a shifted value for anything so lets just forget that
3378  // we preloaded any characters into it.
3380  // Adjust the offsets of the quick check performed information. This
3381  // information is used to find out what we already determined about the
3382  // characters by means of mask and compare.
3383  quick_check_performed_.Advance(by, compiler->one_byte());
3384  cp_offset_ += by;
3386  compiler->SetRegExpTooBig();
3387  cp_offset_ = 0;
3388  }
3390 }
3391 
3392 
3393 void TextNode::MakeCaseIndependent(bool is_one_byte) {
3394  int element_count = elms_->length();
3395  for (int i = 0; i < element_count; i++) {
3396  TextElement elm = elms_->at(i);
3397  if (elm.text_type() == TextElement::CHAR_CLASS) {
3398  RegExpCharacterClass* cc = elm.char_class();
3399  // None of the standard character classes is different in the case
3400  // independent case and it slows us down if we don't know that.
3401  if (cc->is_standard(zone())) continue;
3402  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3403  int range_count = ranges->length();
3404  for (int j = 0; j < range_count; j++) {
3405  ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone());
3406  }
3407  }
3408  }
3409 }
3410 
3411 
3413  TextElement elm = elms_->at(elms_->length() - 1);
3414  return elm.cp_offset() + elm.length();
3415 }
3416 
3417 
3419  RegExpCompiler* compiler) {
3420  if (elms_->length() != 1) return NULL;
3421  TextElement elm = elms_->at(0);
3422  if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3423  RegExpCharacterClass* node = elm.char_class();
3424  ZoneList<CharacterRange>* ranges = node->ranges(zone());
3425  if (!CharacterRange::IsCanonical(ranges)) {
3427  }
3428  if (node->is_negated()) {
3429  return ranges->length() == 0 ? on_success() : NULL;
3430  }
3431  if (ranges->length() != 1) return NULL;
3432  uint32_t max_char;
3433  if (compiler->one_byte()) {
3434  max_char = String::kMaxOneByteCharCode;
3435  } else {
3436  max_char = String::kMaxUtf16CodeUnit;
3437  }
3438  return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3439 }
3440 
3441 
3442 // Finds the fixed match length of a sequence of nodes that goes from
3443 // this alternative and back to this choice node. If there are variable
3444 // length nodes or other complications in the way then return a sentinel
3445 // value indicating that a greedy loop cannot be constructed.
3447  GuardedAlternative* alternative) {
3448  int length = 0;
3449  RegExpNode* node = alternative->node();
3450  // Later we will generate code for all these text nodes using recursion
3451  // so we have to limit the max number.
3452  int recursion_depth = 0;
3453  while (node != this) {
3454  if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3455  return kNodeIsTooComplexForGreedyLoops;
3456  }
3457  int node_length = node->GreedyLoopTextLength();
3458  if (node_length == kNodeIsTooComplexForGreedyLoops) {
3459  return kNodeIsTooComplexForGreedyLoops;
3460  }
3461  length += node_length;
3462  SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3463  node = seq_node->on_success();
3464  }
3465  return length;
3466 }
3467 
3468 
3470  DCHECK_EQ(loop_node_, NULL);
3471  AddAlternative(alt);
3472  loop_node_ = alt.node();
3473 }
3474 
3475 
3477  DCHECK_EQ(continue_node_, NULL);
3478  AddAlternative(alt);
3479  continue_node_ = alt.node();
3480 }
3481 
3482 
3483 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3484  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3485  if (trace->stop_node() == this) {
3486  // Back edge of greedy optimized loop node graph.
3487  int text_length =
3488  GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3489  DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3490  // Update the counter-based backtracking info on the stack. This is an
3491  // optimization for greedy loops (see below).
3492  DCHECK(trace->cp_offset() == text_length);
3493  macro_assembler->AdvanceCurrentPosition(text_length);
3494  macro_assembler->GoTo(trace->loop_label());
3495  return;
3496  }
3497  DCHECK(trace->stop_node() == NULL);
3498  if (!trace->is_trivial()) {
3499  trace->Flush(compiler, this);
3500  return;
3501  }
3502  ChoiceNode::Emit(compiler, trace);
3503 }
3504 
3505 
3507  int eats_at_least) {
3508  int preload_characters = Min(4, eats_at_least);
3509  if (compiler->macro_assembler()->CanReadUnaligned()) {
3510  bool one_byte = compiler->one_byte();
3511  if (one_byte) {
3512  if (preload_characters > 4) preload_characters = 4;
3513  // We can't preload 3 characters because there is no machine instruction
3514  // to do that. We can't just load 4 because we could be reading
3515  // beyond the end of the string, which could cause a memory fault.
3516  if (preload_characters == 3) preload_characters = 2;
3517  } else {
3518  if (preload_characters > 2) preload_characters = 2;
3519  }
3520  } else {
3521  if (preload_characters > 1) preload_characters = 1;
3522  }
3523  return preload_characters;
3524 }
3525 
3526 
3527 // This class is used when generating the alternatives in a choice node. It
3528 // records the way the alternative is being code generated.
3530  public:
3532  : possible_success(),
3533  expects_preload(false),
3534  after(),
3535  quick_check_details() { }
3538  Label after;
3540 };
3541 
3542 
3543 // Creates a list of AlternativeGenerations. If the list has a reasonable
3544 // size then it is on the stack, otherwise the excess is on the heap.
3546  public:
3548  : alt_gens_(count, zone) {
3549  for (int i = 0; i < count && i < kAFew; i++) {
3550  alt_gens_.Add(a_few_alt_gens_ + i, zone);
3551  }
3552  for (int i = kAFew; i < count; i++) {
3553  alt_gens_.Add(new AlternativeGeneration(), zone);
3554  }
3555  }
3557  for (int i = kAFew; i < alt_gens_.length(); i++) {
3558  delete alt_gens_[i];
3559  alt_gens_[i] = NULL;
3560  }
3561  }
3562 
3564  return alt_gens_[i];
3565  }
3566 
3567  private:
3568  static const int kAFew = 10;
3570  AlternativeGeneration a_few_alt_gens_[kAFew];
3571 };
3572 
3573 
3574 // The '2' variant is has inclusive from and exclusive to.
3575 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3576 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3577 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
3578  0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
3579  0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
3580  0xFEFF, 0xFF00, 0x10000 };
3582 
3583 static const int kWordRanges[] = {
3584  '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3586 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3588 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3590 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3591  0x2028, 0x202A, 0x10000 };
3593 
3594 
3595 void BoyerMoorePositionInfo::Set(int character) {
3596  SetInterval(Interval(character, character));
3597 }
3598 
3599 
3601  s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3602  w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3603  d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3604  surrogate_ =
3605  AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3606  if (interval.to() - interval.from() >= kMapSize - 1) {
3607  if (map_count_ != kMapSize) {
3608  map_count_ = kMapSize;
3609  for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3610  }
3611  return;
3612  }
3613  for (int i = interval.from(); i <= interval.to(); i++) {
3614  int mod_character = (i & kMask);
3615  if (!map_->at(mod_character)) {
3616  map_count_++;
3617  map_->at(mod_character) = true;
3618  }
3619  if (map_count_ == kMapSize) return;
3620  }
3621 }
3622 
3623 
3625  s_ = w_ = d_ = kLatticeUnknown;
3626  if (map_count_ != kMapSize) {
3627  map_count_ = kMapSize;
3628  for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3629  }
3630 }
3631 
3632 
3634  int length, RegExpCompiler* compiler, Zone* zone)
3635  : length_(length),
3636  compiler_(compiler) {
3637  if (compiler->one_byte()) {
3639  } else {
3641  }
3643  for (int i = 0; i < length; i++) {
3644  bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3645  }
3646 }
3647 
3648 
3649 // Find the longest range of lookahead that has the fewest number of different
3650 // characters that can occur at a given position. Since we are optimizing two
3651 // different parameters at once this is a tradeoff.
3653  int biggest_points = 0;
3654  // If more than 32 characters out of 128 can occur it is unlikely that we can
3655  // be lucky enough to step forwards much of the time.
3656  const int kMaxMax = 32;
3657  for (int max_number_of_chars = 4;
3658  max_number_of_chars < kMaxMax;
3659  max_number_of_chars *= 2) {
3660  biggest_points =
3661  FindBestInterval(max_number_of_chars, biggest_points, from, to);
3662  }
3663  if (biggest_points == 0) return false;
3664  return true;
3665 }
3666 
3667 
3668 // Find the highest-points range between 0 and length_ where the character
3669 // information is not too vague. 'Too vague' means that there are more than
3670 // max_number_of_chars that can occur at this position. Calculates the number
3671 // of points as the product of width-of-the-range and
3672 // probability-of-finding-one-of-the-characters, where the probability is
3673 // calculated using the frequency distribution of the sample subject string.
3675  int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3676  int biggest_points = old_biggest_points;
3677  static const int kSize = RegExpMacroAssembler::kTableSize;
3678  for (int i = 0; i < length_; ) {
3679  while (i < length_ && Count(i) > max_number_of_chars) i++;
3680  if (i == length_) break;
3681  int remembered_from = i;
3682  bool union_map[kSize];
3683  for (int j = 0; j < kSize; j++) union_map[j] = false;
3684  while (i < length_ && Count(i) <= max_number_of_chars) {
3686  for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3687  i++;
3688  }
3689  int frequency = 0;
3690  for (int j = 0; j < kSize; j++) {
3691  if (union_map[j]) {
3692  // Add 1 to the frequency to give a small per-character boost for
3693  // the cases where our sampling is not good enough and many
3694  // characters have a frequency of zero. This means the frequency
3695  // can theoretically be up to 2*kSize though we treat it mostly as
3696  // a fraction of kSize.
3697  frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3698  }
3699  }
3700  // We use the probability of skipping times the distance we are skipping to
3701  // judge the effectiveness of this. Actually we have a cut-off: By
3702  // dividing by 2 we switch off the skipping if the probability of skipping
3703  // is less than 50%. This is because the multibyte mask-and-compare
3704  // skipping in quickcheck is more likely to do well on this case.
3705  bool in_quickcheck_range =
3706  ((i - remembered_from < 4) ||
3707  (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3708  // Called 'probability' but it is only a rough estimate and can actually
3709  // be outside the 0-kSize range.
3710  int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3711  int points = (i - remembered_from) * probability;
3712  if (points > biggest_points) {
3713  *from = remembered_from;
3714  *to = i - 1;
3715  biggest_points = points;
3716  }
3717  }
3718  return biggest_points;
3719 }
3720 
3721 
3722 // Take all the characters that will not prevent a successful match if they
3723 // occur in the subject string in the range between min_lookahead and
3724 // max_lookahead (inclusive) measured from the current position. If the
3725 // character at max_lookahead offset is not one of these characters, then we
3726 // can safely skip forwards by the number of characters in the range.
3728  int max_lookahead,
3729  Handle<ByteArray> boolean_skip_table) {
3730  const int kSize = RegExpMacroAssembler::kTableSize;
3731 
3732  const int kSkipArrayEntry = 0;
3733  const int kDontSkipArrayEntry = 1;
3734 
3735  for (int i = 0; i < kSize; i++) {
3736  boolean_skip_table->set(i, kSkipArrayEntry);
3737  }
3738  int skip = max_lookahead + 1 - min_lookahead;
3739 
3740  for (int i = max_lookahead; i >= min_lookahead; i--) {
3742  for (int j = 0; j < kSize; j++) {
3743  if (map->at(j)) {
3744  boolean_skip_table->set(j, kDontSkipArrayEntry);
3745  }
3746  }
3747  }
3748 
3749  return skip;
3750 }
3751 
3752 
3753 // See comment above on the implementation of GetSkipTable.
3755  const int kSize = RegExpMacroAssembler::kTableSize;
3756 
3757  int min_lookahead = 0;
3758  int max_lookahead = 0;
3759 
3760  if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3761 
3762  bool found_single_character = false;
3763  int single_character = 0;
3764  for (int i = max_lookahead; i >= min_lookahead; i--) {
3766  if (map->map_count() > 1 ||
3767  (found_single_character && map->map_count() != 0)) {
3768  found_single_character = false;
3769  break;
3770  }
3771  for (int j = 0; j < kSize; j++) {
3772  if (map->at(j)) {
3773  found_single_character = true;
3774  single_character = j;
3775  break;
3776  }
3777  }
3778  }
3779 
3780  int lookahead_width = max_lookahead + 1 - min_lookahead;
3781 
3782  if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3783  // The mask-compare can probably handle this better.
3784  return;
3785  }
3786 
3787  if (found_single_character) {
3788  Label cont, again;
3789  masm->Bind(&again);
3790  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3791  if (max_char_ > kSize) {
3792  masm->CheckCharacterAfterAnd(single_character,
3794  &cont);
3795  } else {
3796  masm->CheckCharacter(single_character, &cont);
3797  }
3798  masm->AdvanceCurrentPosition(lookahead_width);
3799  masm->GoTo(&again);
3800  masm->Bind(&cont);
3801  return;
3802  }
3803 
3804  Factory* factory = masm->zone()->isolate()->factory();
3805  Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3806  int skip_distance = GetSkipTable(
3807  min_lookahead, max_lookahead, boolean_skip_table);
3808  DCHECK(skip_distance != 0);
3809 
3810  Label cont, again;
3811  masm->Bind(&again);
3812  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3813  masm->CheckBitInTable(boolean_skip_table, &cont);
3814  masm->AdvanceCurrentPosition(skip_distance);
3815  masm->GoTo(&again);
3816  masm->Bind(&cont);
3817 }
3818 
3819 
3820 /* Code generation for choice nodes.
3821  *
3822  * We generate quick checks that do a mask and compare to eliminate a
3823  * choice. If the quick check succeeds then it jumps to the continuation to
3824  * do slow checks and check subsequent nodes. If it fails (the common case)
3825  * it falls through to the next choice.
3826  *
3827  * Here is the desired flow graph. Nodes directly below each other imply
3828  * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3829  * 3 doesn't have a quick check so we have to call the slow check.
3830  * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3831  * regexp continuation is generated directly after the Sn node, up to the
3832  * next GoTo if we decide to reuse some already generated code. Some
3833  * nodes expect preload_characters to be preloaded into the current
3834  * character register. R nodes do this preloading. Vertices are marked
3835  * F for failures and S for success (possible success in the case of quick
3836  * nodes). L, V, < and > are used as arrow heads.
3837  *
3838  * ----------> R
3839  * |
3840  * V
3841  * Q1 -----> S1
3842  * | S /
3843  * F| /
3844  * | F/
3845  * | /
3846  * | R
3847  * | /
3848  * V L
3849  * Q2 -----> S2
3850  * | S /
3851  * F| /
3852  * | F/
3853  * | /
3854  * | R
3855  * | /
3856  * V L
3857  * S3
3858  * |
3859  * F|
3860  * |
3861  * R
3862  * |
3863  * backtrack V
3864  * <----------Q4
3865  * \ F |
3866  * \ |S
3867  * \ F V
3868  * \-----S4
3869  *
3870  * For greedy loops we push the current position, then generate the code that
3871  * eats the input specially in EmitGreedyLoop. The other choice (the
3872  * continuation) is generated by the normal code in EmitChoices, and steps back
3873  * in the input to the starting position when it fails to match. The loop code
3874  * looks like this (U is the unwind code that steps back in the greedy loop).
3875  *
3876  * _____
3877  * / \
3878  * V |
3879  * ----------> S1 |
3880  * /| |
3881  * / |S |
3882  * F/ \_____/
3883  * /
3884  * |<-----
3885  * | \
3886  * V |S
3887  * Q2 ---> U----->backtrack
3888  * | F /
3889  * S| /
3890  * V F /
3891  * S2--/
3892  */
3893 
3896  if (not_at_start) counter_backtrack_trace_.set_at_start(false);
3897 }
3898 
3899 
3901 #ifdef DEBUG
3902  int choice_count = alternatives_->length();
3903  for (int i = 0; i < choice_count - 1; i++) {
3904  GuardedAlternative alternative = alternatives_->at(i);
3905  ZoneList<Guard*>* guards = alternative.guards();
3906  int guard_count = (guards == NULL) ? 0 : guards->length();
3907  for (int j = 0; j < guard_count; j++) {
3908  DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3909  }
3910  }
3911 #endif
3912 }
3913 
3914 
3916  Trace* current_trace,
3917  PreloadState* state) {
3919  // Save some time by looking at most one machine word ahead.
3920  state->eats_at_least_ =
3921  EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3922  current_trace->at_start() == Trace::FALSE_VALUE);
3923  }
3924  state->preload_characters_ =
3925  CalculatePreloadCharacters(compiler, state->eats_at_least_);
3926 
3927  state->preload_is_current_ =
3928  (current_trace->characters_preloaded() == state->preload_characters_);
3930 }
3931 
3932 
3933 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3934  int choice_count = alternatives_->length();
3935 
3937 
3938  LimitResult limit_result = LimitVersions(compiler, trace);
3939  if (limit_result == DONE) return;
3940  DCHECK(limit_result == CONTINUE);
3941 
3942  // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3943  // other choice nodes we only flush if we are out of code size budget.
3944  if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3945  trace->Flush(compiler, this);
3946  return;
3947  }
3948 
3949  RecursionCheck rc(compiler);
3950 
3951  PreloadState preload;
3952  preload.init();
3953  GreedyLoopState greedy_loop_state(not_at_start());
3954 
3955  int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3956  AlternativeGenerationList alt_gens(choice_count, zone());
3957 
3958  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3959  trace = EmitGreedyLoop(compiler,
3960  trace,
3961  &alt_gens,
3962  &preload,
3963  &greedy_loop_state,
3964  text_length);
3965  } else {
3966  // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3967  // match the traces produced pre-cleanup.
3968  Label second_choice;
3969  compiler->macro_assembler()->Bind(&second_choice);
3970 
3971  preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3972 
3973  EmitChoices(compiler,
3974  &alt_gens,
3975  0,
3976  trace,
3977  &preload);
3978  }
3979 
3980  // At this point we need to generate slow checks for the alternatives where
3981  // the quick check was inlined. We can recognize these because the associated
3982  // label was bound.
3983  int new_flush_budget = trace->flush_budget() / choice_count;
3984  for (int i = 0; i < choice_count; i++) {
3985  AlternativeGeneration* alt_gen = alt_gens.at(i);
3986  Trace new_trace(*trace);
3987  // If there are actions to be flushed we have to limit how many times
3988  // they are flushed. Take the budget of the parent trace and distribute
3989  // it fairly amongst the children.
3990  if (new_trace.actions() != NULL) {
3991  new_trace.set_flush_budget(new_flush_budget);
3992  }
3993  bool next_expects_preload =
3994  i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3995  EmitOutOfLineContinuation(compiler,
3996  &new_trace,
3997  alternatives_->at(i),
3998  alt_gen,
3999  preload.preload_characters_,
4000  next_expects_preload);
4001  }
4002 }
4003 
4004 
4006  Trace* trace,
4007  AlternativeGenerationList* alt_gens,
4008  PreloadState* preload,
4009  GreedyLoopState* greedy_loop_state,
4010  int text_length) {
4011  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4012  // Here we have special handling for greedy loops containing only text nodes
4013  // and other simple nodes. These are handled by pushing the current
4014  // position on the stack and then incrementing the current position each
4015  // time around the switch. On backtrack we decrement the current position
4016  // and check it against the pushed value. This avoids pushing backtrack
4017  // information for each iteration of the loop, which could take up a lot of
4018  // space.
4019  DCHECK(trace->stop_node() == NULL);
4020  macro_assembler->PushCurrentPosition();
4021  Label greedy_match_failed;
4022  Trace greedy_match_trace;
4023  if (not_at_start()) greedy_match_trace.set_at_start(false);
4024  greedy_match_trace.set_backtrack(&greedy_match_failed);
4025  Label loop_label;
4026  macro_assembler->Bind(&loop_label);
4027  greedy_match_trace.set_stop_node(this);
4028  greedy_match_trace.set_loop_label(&loop_label);
4029  alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4030  macro_assembler->Bind(&greedy_match_failed);
4031 
4032  Label second_choice; // For use in greedy matches.
4033  macro_assembler->Bind(&second_choice);
4034 
4035  Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4036 
4037  EmitChoices(compiler,
4038  alt_gens,
4039  1,
4040  new_trace,
4041  preload);
4042 
4043  macro_assembler->Bind(greedy_loop_state->label());
4044  // If we have unwound to the bottom then backtrack.
4045  macro_assembler->CheckGreedyLoop(trace->backtrack());
4046  // Otherwise try the second priority at an earlier position.
4047  macro_assembler->AdvanceCurrentPosition(-text_length);
4048  macro_assembler->GoTo(&second_choice);
4049  return new_trace;
4050 }
4051 
4053  Trace* trace) {
4054  int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4055  if (alternatives_->length() != 2) return eats_at_least;
4056 
4057  GuardedAlternative alt1 = alternatives_->at(1);
4058  if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4059  return eats_at_least;
4060  }
4061  RegExpNode* eats_anything_node = alt1.node();
4062  if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4063  return eats_at_least;
4064  }
4065 
4066  // Really we should be creating a new trace when we execute this function,
4067  // but there is no need, because the code it generates cannot backtrack, and
4068  // we always arrive here with a trivial trace (since it's the entry to a
4069  // loop. That also implies that there are no preloaded characters, which is
4070  // good, because it means we won't be violating any assumptions by
4071  // overwriting those characters with new load instructions.
4072  DCHECK(trace->is_trivial());
4073 
4074  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4075  // At this point we know that we are at a non-greedy loop that will eat
4076  // any character one at a time. Any non-anchored regexp has such a
4077  // loop prepended to it in order to find where it starts. We look for
4078  // a pattern of the form ...abc... where we can look 6 characters ahead
4079  // and step forwards 3 if the character is not one of abc. Abc need
4080  // not be atoms, they can be any reasonably limited character class or
4081  // small alternation.
4082  BoyerMooreLookahead* bm = bm_info(false);
4083  if (bm == NULL) {
4084  eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4087  false));
4088  if (eats_at_least >= 1) {
4089  bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4090  compiler,
4091  zone());
4092  GuardedAlternative alt0 = alternatives_->at(0);
4093  alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
4094  }
4095  }
4096  if (bm != NULL) {
4097  bm->EmitSkipInstructions(macro_assembler);
4098  }
4099  return eats_at_least;
4100 }
4101 
4102 
4104  AlternativeGenerationList* alt_gens,
4105  int first_choice,
4106  Trace* trace,
4107  PreloadState* preload) {
4108  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4109  SetUpPreLoad(compiler, trace, preload);
4110 
4111  // For now we just call all choices one after the other. The idea ultimately
4112  // is to use the Dispatch table to try only the relevant ones.
4113  int choice_count = alternatives_->length();
4114 
4115  int new_flush_budget = trace->flush_budget() / choice_count;
4116 
4117  for (int i = first_choice; i < choice_count; i++) {
4118  bool is_last = i == choice_count - 1;
4119  bool fall_through_on_failure = !is_last;
4120  GuardedAlternative alternative = alternatives_->at(i);
4121  AlternativeGeneration* alt_gen = alt_gens->at(i);
4123  ZoneList<Guard*>* guards = alternative.guards();
4124  int guard_count = (guards == NULL) ? 0 : guards->length();
4125  Trace new_trace(*trace);
4126  new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4127  preload->preload_characters_ :
4128  0);
4129  if (preload->preload_has_checked_bounds_) {
4130  new_trace.set_bound_checked_up_to(preload->preload_characters_);
4131  }
4132  new_trace.quick_check_performed()->Clear();
4134  if (!is_last) {
4135  new_trace.set_backtrack(&alt_gen->after);
4136  }
4137  alt_gen->expects_preload = preload->preload_is_current_;
4138  bool generate_full_check_inline = false;
4139  if (FLAG_regexp_optimization &&
4141  alternative.node()->EmitQuickCheck(compiler,
4142  trace,
4143  &new_trace,
4144  preload->preload_has_checked_bounds_,
4145  &alt_gen->possible_success,
4146  &alt_gen->quick_check_details,
4147  fall_through_on_failure)) {
4148  // Quick check was generated for this choice.
4149  preload->preload_is_current_ = true;
4150  preload->preload_has_checked_bounds_ = true;
4151  // If we generated the quick check to fall through on possible success,
4152  // we now need to generate the full check inline.
4153  if (!fall_through_on_failure) {
4154  macro_assembler->Bind(&alt_gen->possible_success);
4155  new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4156  new_trace.set_characters_preloaded(preload->preload_characters_);
4157  new_trace.set_bound_checked_up_to(preload->preload_characters_);
4158  generate_full_check_inline = true;
4159  }
4160  } else if (alt_gen->quick_check_details.cannot_match()) {
4161  if (!fall_through_on_failure) {
4162  macro_assembler->GoTo(trace->backtrack());
4163  }
4164  continue;
4165  } else {
4166  // No quick check was generated. Put the full code here.
4167  // If this is not the first choice then there could be slow checks from
4168  // previous cases that go here when they fail. There's no reason to
4169  // insist that they preload characters since the slow check we are about
4170  // to generate probably can't use it.
4171  if (i != first_choice) {
4172  alt_gen->expects_preload = false;
4173  new_trace.InvalidateCurrentCharacter();
4174  }
4175  generate_full_check_inline = true;
4176  }
4177  if (generate_full_check_inline) {
4178  if (new_trace.actions() != NULL) {
4179  new_trace.set_flush_budget(new_flush_budget);
4180  }
4181  for (int j = 0; j < guard_count; j++) {
4182  GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4183  }
4184  alternative.node()->Emit(compiler, &new_trace);
4185  preload->preload_is_current_ = false;
4186  }
4187  macro_assembler->Bind(&alt_gen->after);
4188  }
4189 }
4190 
4191 
4193  Trace* trace,
4194  GuardedAlternative alternative,
4195  AlternativeGeneration* alt_gen,
4196  int preload_characters,
4197  bool next_expects_preload) {
4198  if (!alt_gen->possible_success.is_linked()) return;
4199 
4200  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4201  macro_assembler->Bind(&alt_gen->possible_success);
4202  Trace out_of_line_trace(*trace);
4203  out_of_line_trace.set_characters_preloaded(preload_characters);
4204  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4205  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4206  ZoneList<Guard*>* guards = alternative.guards();
4207  int guard_count = (guards == NULL) ? 0 : guards->length();
4208  if (next_expects_preload) {
4209  Label reload_current_char;
4210  out_of_line_trace.set_backtrack(&reload_current_char);
4211  for (int j = 0; j < guard_count; j++) {
4212  GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4213  }
4214  alternative.node()->Emit(compiler, &out_of_line_trace);
4215  macro_assembler->Bind(&reload_current_char);
4216  // Reload the current character, since the next quick check expects that.
4217  // We don't need to check bounds here because we only get into this
4218  // code through a quick check which already did the checked load.
4219  macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4220  NULL,
4221  false,
4222  preload_characters);
4223  macro_assembler->GoTo(&(alt_gen->after));
4224  } else {
4225  out_of_line_trace.set_backtrack(&(alt_gen->after));
4226  for (int j = 0; j < guard_count; j++) {
4227  GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4228  }
4229  alternative.node()->Emit(compiler, &out_of_line_trace);
4230  }
4231 }
4232 
4233 
4234 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4235  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4236  LimitResult limit_result = LimitVersions(compiler, trace);
4237  if (limit_result == DONE) return;
4238  DCHECK(limit_result == CONTINUE);
4239 
4240  RecursionCheck rc(compiler);
4241 
4242  switch (action_type_) {
4243  case STORE_POSITION: {
4245  new_capture(data_.u_position_register.reg,
4246  data_.u_position_register.is_capture,
4247  trace);
4248  Trace new_trace = *trace;
4249  new_trace.add_action(&new_capture);
4250  on_success()->Emit(compiler, &new_trace);
4251  break;
4252  }
4253  case INCREMENT_REGISTER: {
4255  new_increment(data_.u_increment_register.reg);
4256  Trace new_trace = *trace;
4257  new_trace.add_action(&new_increment);
4258  on_success()->Emit(compiler, &new_trace);
4259  break;
4260  }
4261  case SET_REGISTER: {
4263  new_set(data_.u_store_register.reg, data_.u_store_register.value);
4264  Trace new_trace = *trace;
4265  new_trace.add_action(&new_set);
4266  on_success()->Emit(compiler, &new_trace);
4267  break;
4268  }
4269  case CLEAR_CAPTURES: {
4271  new_capture(Interval(data_.u_clear_captures.range_from,
4272  data_.u_clear_captures.range_to));
4273  Trace new_trace = *trace;
4274  new_trace.add_action(&new_capture);
4275  on_success()->Emit(compiler, &new_trace);
4276  break;
4277  }
4278  case BEGIN_SUBMATCH:
4279  if (!trace->is_trivial()) {
4280  trace->Flush(compiler, this);
4281  } else {
4282  assembler->WriteCurrentPositionToRegister(
4283  data_.u_submatch.current_position_register, 0);
4284  assembler->WriteStackPointerToRegister(
4285  data_.u_submatch.stack_pointer_register);
4286  on_success()->Emit(compiler, trace);
4287  }
4288  break;
4289  case EMPTY_MATCH_CHECK: {
4290  int start_pos_reg = data_.u_empty_match_check.start_register;
4291  int stored_pos = 0;
4292  int rep_reg = data_.u_empty_match_check.repetition_register;
4293  bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4294  bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4295  if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4296  // If we know we haven't advanced and there is no minimum we
4297  // can just backtrack immediately.
4298  assembler->GoTo(trace->backtrack());
4299  } else if (know_dist && stored_pos < trace->cp_offset()) {
4300  // If we know we've advanced we can generate the continuation
4301  // immediately.
4302  on_success()->Emit(compiler, trace);
4303  } else if (!trace->is_trivial()) {
4304  trace->Flush(compiler, this);
4305  } else {
4306  Label skip_empty_check;
4307  // If we have a minimum number of repetitions we check the current
4308  // number first and skip the empty check if it's not enough.
4309  if (has_minimum) {
4310  int limit = data_.u_empty_match_check.repetition_limit;
4311  assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4312  }
4313  // If the match is empty we bail out, otherwise we fall through
4314  // to the on-success continuation.
4315  assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4316  trace->backtrack());
4317  assembler->Bind(&skip_empty_check);
4318  on_success()->Emit(compiler, trace);
4319  }
4320  break;
4321  }
4323  if (!trace->is_trivial()) {
4324  trace->Flush(compiler, this);
4325  return;
4326  }
4328  data_.u_submatch.current_position_register);
4329  assembler->ReadStackPointerFromRegister(
4330  data_.u_submatch.stack_pointer_register);
4331  int clear_register_count = data_.u_submatch.clear_register_count;
4332  if (clear_register_count == 0) {
4333  on_success()->Emit(compiler, trace);
4334  return;
4335  }
4336  int clear_registers_from = data_.u_submatch.clear_register_from;
4337  Label clear_registers_backtrack;
4338  Trace new_trace = *trace;
4339  new_trace.set_backtrack(&clear_registers_backtrack);
4340  on_success()->Emit(compiler, &new_trace);
4341 
4342  assembler->Bind(&clear_registers_backtrack);
4343  int clear_registers_to = clear_registers_from + clear_register_count - 1;
4344  assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4345 
4346  DCHECK(trace->backtrack() == NULL);
4347  assembler->Backtrack();
4348  return;
4349  }
4350  default:
4351  UNREACHABLE();
4352  }
4353 }
4354 
4355 
4357  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4358  if (!trace->is_trivial()) {
4359  trace->Flush(compiler, this);
4360  return;
4361  }
4362 
4363  LimitResult limit_result = LimitVersions(compiler, trace);
4364  if (limit_result == DONE) return;
4365  DCHECK(limit_result == CONTINUE);
4366 
4367  RecursionCheck rc(compiler);
4368 
4370  if (compiler->ignore_case()) {
4372  trace->backtrack());
4373  } else {
4374  assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4375  }
4376  on_success()->Emit(compiler, trace);
4377 }
4378 
4379 
4380 // -------------------------------------------------------------------
4381 // Dot/dotty output
4382 
4383 
4384 #ifdef DEBUG
4385 
4386 
4387 class DotPrinter: public NodeVisitor {
4388  public:
4389  DotPrinter(OStream& os, bool ignore_case) // NOLINT
4390  : os_(os),
4391  ignore_case_(ignore_case) {}
4392  void PrintNode(const char* label, RegExpNode* node);
4393  void Visit(RegExpNode* node);
4394  void PrintAttributes(RegExpNode* from);
4395  void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4396 #define DECLARE_VISIT(Type) \
4397  virtual void Visit##Type(Type##Node* that);
4399 #undef DECLARE_VISIT
4400  private:
4401  OStream& os_;
4402  bool ignore_case_;
4403 };
4404 
4405 
4406 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4407  os_ << "digraph G {\n graph [label=\"";
4408  for (int i = 0; label[i]; i++) {
4409  switch (label[i]) {
4410  case '\\':
4411  os_ << "\\\\";
4412  break;
4413  case '"':
4414  os_ << "\"";
4415  break;
4416  default:
4417  os_ << label[i];
4418  break;
4419  }
4420  }
4421  os_ << "\"];\n";
4422  Visit(node);
4423  os_ << "}" << endl;
4424 }
4425 
4426 
4427 void DotPrinter::Visit(RegExpNode* node) {
4428  if (node->info()->visited) return;
4429  node->info()->visited = true;
4430  node->Accept(this);
4431 }
4432 
4433 
4434 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4435  os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4436  Visit(on_failure);
4437 }
4438 
4439 
4440 class TableEntryBodyPrinter {
4441  public:
4442  TableEntryBodyPrinter(OStream& os, ChoiceNode* choice) // NOLINT
4443  : os_(os),
4444  choice_(choice) {}
4445  void Call(uc16 from, DispatchTable::Entry entry) {
4446  OutSet* out_set = entry.out_set();
4447  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4448  if (out_set->Get(i)) {
4449  os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4450  << choice()->alternatives()->at(i).node() << ";\n";
4451  }
4452  }
4453  }
4454  private:
4455  ChoiceNode* choice() { return choice_; }
4456  OStream& os_;
4457  ChoiceNode* choice_;
4458 };
4459 
4460 
4461 class TableEntryHeaderPrinter {
4462  public:
4463  explicit TableEntryHeaderPrinter(OStream& os) // NOLINT
4464  : first_(true),
4465  os_(os) {}
4466  void Call(uc16 from, DispatchTable::Entry entry) {
4467  if (first_) {
4468  first_ = false;
4469  } else {
4470  os_ << "|";
4471  }
4472  os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4473  OutSet* out_set = entry.out_set();
4474  int priority = 0;
4475  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4476  if (out_set->Get(i)) {
4477  if (priority > 0) os_ << "|";
4478  os_ << "<s" << from << "o" << i << "> " << priority;
4479  priority++;
4480  }
4481  }
4482  os_ << "}}";
4483  }
4484 
4485  private:
4486  bool first_;
4487  OStream& os_;
4488 };
4489 
4490 
4491 class AttributePrinter {
4492  public:
4493  explicit AttributePrinter(OStream& os) // NOLINT
4494  : os_(os),
4495  first_(true) {}
4496  void PrintSeparator() {
4497  if (first_) {
4498  first_ = false;
4499  } else {
4500  os_ << "|";
4501  }
4502  }
4503  void PrintBit(const char* name, bool value) {
4504  if (!value) return;
4505  PrintSeparator();
4506  os_ << "{" << name << "}";
4507  }
4508  void PrintPositive(const char* name, int value) {
4509  if (value < 0) return;
4510  PrintSeparator();
4511  os_ << "{" << name << "|" << value << "}";
4512  }
4513 
4514  private:
4515  OStream& os_;
4516  bool first_;
4517 };
4518 
4519 
4520 void DotPrinter::PrintAttributes(RegExpNode* that) {
4521  os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4522  << "margin=0.1, fontsize=10, label=\"{";
4523  AttributePrinter printer(os_);
4524  NodeInfo* info = that->info();
4525  printer.PrintBit("NI", info->follows_newline_interest);
4526  printer.PrintBit("WI", info->follows_word_interest);
4527  printer.PrintBit("SI", info->follows_start_interest);
4528  Label* label = that->label();
4529  if (label->is_bound())
4530  printer.PrintPositive("@", label->pos());
4531  os_ << "}\"];\n"
4532  << " a" << that << " -> n" << that
4533  << " [style=dashed, color=grey, arrowhead=none];\n";
4534 }
4535 
4536 
4537 static const bool kPrintDispatchTable = false;
4538 void DotPrinter::VisitChoice(ChoiceNode* that) {
4539  if (kPrintDispatchTable) {
4540  os_ << " n" << that << " [shape=Mrecord, label=\"";
4541  TableEntryHeaderPrinter header_printer(os_);
4542  that->GetTable(ignore_case_)->ForEach(&header_printer);
4543  os_ << "\"]\n";
4544  PrintAttributes(that);
4545  TableEntryBodyPrinter body_printer(os_, that);
4546  that->GetTable(ignore_case_)->ForEach(&body_printer);
4547  } else {
4548  os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4549  for (int i = 0; i < that->alternatives()->length(); i++) {
4550  GuardedAlternative alt = that->alternatives()->at(i);
4551  os_ << " n" << that << " -> n" << alt.node();
4552  }
4553  }
4554  for (int i = 0; i < that->alternatives()->length(); i++) {
4555  GuardedAlternative alt = that->alternatives()->at(i);
4556  alt.node()->Accept(this);
4557  }
4558 }
4559 
4560 
4561 void DotPrinter::VisitText(TextNode* that) {
4562  Zone* zone = that->zone();
4563  os_ << " n" << that << " [label=\"";
4564  for (int i = 0; i < that->elements()->length(); i++) {
4565  if (i > 0) os_ << " ";
4566  TextElement elm = that->elements()->at(i);
4567  switch (elm.text_type()) {
4568  case TextElement::ATOM: {
4569  Vector<const uc16> data = elm.atom()->data();
4570  for (int i = 0; i < data.length(); i++) {
4571  os_ << static_cast<char>(data[i]);
4572  }
4573  break;
4574  }
4575  case TextElement::CHAR_CLASS: {
4576  RegExpCharacterClass* node = elm.char_class();
4577  os_ << "[";
4578  if (node->is_negated()) os_ << "^";
4579  for (int j = 0; j < node->ranges(zone)->length(); j++) {
4580  CharacterRange range = node->ranges(zone)->at(j);
4581  os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4582  }
4583  os_ << "]";
4584  break;
4585  }
4586  default:
4587  UNREACHABLE();
4588  }
4589  }
4590  os_ << "\", shape=box, peripheries=2];\n";
4591  PrintAttributes(that);
4592  os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4593  Visit(that->on_success());
4594 }
4595 
4596 
4597 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4598  os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4599  << that->end_register() << "\", shape=doubleoctagon];\n";
4600  PrintAttributes(that);
4601  os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4602  Visit(that->on_success());
4603 }
4604 
4605 
4606 void DotPrinter::VisitEnd(EndNode* that) {
4607  os_ << " n" << that << " [style=bold, shape=point];\n";
4608  PrintAttributes(that);
4609 }
4610 
4611 
4612 void DotPrinter::VisitAssertion(AssertionNode* that) {
4613  os_ << " n" << that << " [";
4614  switch (that->assertion_type()) {
4615  case AssertionNode::AT_END:
4616  os_ << "label=\"$\", shape=septagon";
4617  break;
4619  os_ << "label=\"^\", shape=septagon";
4620  break;
4622  os_ << "label=\"\\b\", shape=septagon";
4623  break;
4625  os_ << "label=\"\\B\", shape=septagon";
4626  break;
4628  os_ << "label=\"(?<=\\n)\", shape=septagon";
4629  break;
4630  }
4631  os_ << "];\n";
4632  PrintAttributes(that);
4633  RegExpNode* successor = that->on_success();
4634  os_ << " n" << that << " -> n" << successor << ";\n";
4635  Visit(successor);
4636 }
4637 
4638 
4639 void DotPrinter::VisitAction(ActionNode* that) {
4640  os_ << " n" << that << " [";
4641  switch (that->action_type_) {
4643  os_ << "label=\"$" << that->data_.u_store_register.reg
4644  << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4645  break;
4647  os_ << "label=\"$" << that->data_.u_increment_register.reg
4648  << "++\", shape=octagon";
4649  break;
4651  os_ << "label=\"$" << that->data_.u_position_register.reg
4652  << ":=$pos\", shape=octagon";
4653  break;
4655  os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4656  << ":=$pos,begin\", shape=septagon";
4657  break;
4659  os_ << "label=\"escape\", shape=septagon";
4660  break;
4662  os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4663  << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4664  << "<" << that->data_.u_empty_match_check.repetition_limit
4665  << "?\", shape=septagon";
4666  break;
4668  os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4669  << " to $" << that->data_.u_clear_captures.range_to
4670  << "\", shape=septagon";
4671  break;
4672  }
4673  }
4674  os_ << "];\n";
4675  PrintAttributes(that);
4676  RegExpNode* successor = that->on_success();
4677  os_ << " n" << that << " -> n" << successor << ";\n";
4678  Visit(successor);
4679 }
4680 
4681 
4682 class DispatchTableDumper {
4683  public:
4684  explicit DispatchTableDumper(OStream& os) : os_(os) {}
4685  void Call(uc16 key, DispatchTable::Entry entry);
4686  private:
4687  OStream& os_;
4688 };
4689 
4690 
4691 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4692  os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4693  OutSet* set = entry.out_set();
4694  bool first = true;
4695  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4696  if (set->Get(i)) {
4697  if (first) {
4698  first = false;
4699  } else {
4700  os_ << ", ";
4701  }
4702  os_ << i;
4703  }
4704  }
4705  os_ << "}\n";
4706 }
4707 
4708 
4709 void DispatchTable::Dump() {
4710  OFStream os(stderr);
4711  DispatchTableDumper dumper(os);
4712  tree()->ForEach(&dumper);
4713 }
4714 
4715 
4716 void RegExpEngine::DotPrint(const char* label,
4717  RegExpNode* node,
4718  bool ignore_case) {
4719  OFStream os(stdout);
4720  DotPrinter printer(os, ignore_case);
4721  printer.PrintNode(label, node);
4722 }
4723 
4724 
4725 #endif // DEBUG
4726 
4727 
4728 // -------------------------------------------------------------------
4729 // Tree to graph conversion
4730 
4731 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4732  RegExpNode* on_success) {
4733  ZoneList<TextElement>* elms =
4734  new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4735  elms->Add(TextElement::Atom(this), compiler->zone());
4736  return new(compiler->zone()) TextNode(elms, on_success);
4737 }
4738 
4739 
4740 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4741  RegExpNode* on_success) {
4742  return new(compiler->zone()) TextNode(elements(), on_success);
4743 }
4744 
4745 
4747  const int* special_class,
4748  int length) {
4749  length--; // Remove final 0x10000.
4750  DCHECK(special_class[length] == 0x10000);
4751  DCHECK(ranges->length() != 0);
4752  DCHECK(length != 0);
4753  DCHECK(special_class[0] != 0);
4754  if (ranges->length() != (length >> 1) + 1) {
4755  return false;
4756  }
4757  CharacterRange range = ranges->at(0);
4758  if (range.from() != 0) {
4759  return false;
4760  }
4761  for (int i = 0; i < length; i += 2) {
4762  if (special_class[i] != (range.to() + 1)) {
4763  return false;
4764  }
4765  range = ranges->at((i >> 1) + 1);
4766  if (special_class[i+1] != range.from()) {
4767  return false;
4768  }
4769  }
4770  if (range.to() != 0xffff) {
4771  return false;
4772  }
4773  return true;
4774 }
4775 
4776 
4778  const int* special_class,
4779  int length) {
4780  length--; // Remove final 0x10000.
4781  DCHECK(special_class[length] == 0x10000);
4782  if (ranges->length() * 2 != length) {
4783  return false;
4784  }
4785  for (int i = 0; i < length; i += 2) {
4786  CharacterRange range = ranges->at(i >> 1);
4787  if (range.from() != special_class[i] ||
4788  range.to() != special_class[i + 1] - 1) {
4789  return false;
4790  }
4791  }
4792  return true;
4793 }
4794 
4795 
4796 bool RegExpCharacterClass::is_standard(Zone* zone) {
4797  // TODO(lrn): Remove need for this function, by not throwing away information
4798  // along the way.
4799  if (is_negated_) {
4800  return false;
4801  }
4802  if (set_.is_standard()) {
4803  return true;
4804  }
4805  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4806  set_.set_standard_set_type('s');
4807  return true;
4808  }
4809  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4810  set_.set_standard_set_type('S');
4811  return true;
4812  }
4813  if (CompareInverseRanges(set_.ranges(zone),
4816  set_.set_standard_set_type('.');
4817  return true;
4818  }
4819  if (CompareRanges(set_.ranges(zone),
4822  set_.set_standard_set_type('n');
4823  return true;
4824  }
4825  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4826  set_.set_standard_set_type('w');
4827  return true;
4828  }
4829  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4830  set_.set_standard_set_type('W');
4831  return true;
4832  }
4833  return false;
4834 }
4835 
4836 
4837 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4838  RegExpNode* on_success) {
4839  return new(compiler->zone()) TextNode(this, on_success);
4840 }
4841 
4842 
4843 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4844  RegExpNode* on_success) {
4845  ZoneList<RegExpTree*>* alternatives = this->alternatives();
4846  int length = alternatives->length();
4847  ChoiceNode* result =
4848  new(compiler->zone()) ChoiceNode(length, compiler->zone());
4849  for (int i = 0; i < length; i++) {
4850  GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4851  on_success));
4852  result->AddAlternative(alternative);
4853  }
4854  return result;
4855 }
4856 
4857 
4858 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4859  RegExpNode* on_success) {
4860  return ToNode(min(),
4861  max(),
4862  is_greedy(),
4863  body(),
4864  compiler,
4865  on_success);
4866 }
4867 
4868 
4869 // Scoped object to keep track of how much we unroll quantifier loops in the
4870 // regexp graph generator.
4872  public:
4873  static const int kMaxExpansionFactor = 6;
4875  : compiler_(compiler),
4876  saved_expansion_factor_(compiler->current_expansion_factor()),
4878  DCHECK(factor > 0);
4879  if (ok_to_expand_) {
4880  if (factor > kMaxExpansionFactor) {
4881  // Avoid integer overflow of the current expansion factor.
4882  ok_to_expand_ = false;
4884  } else {
4885  int new_factor = saved_expansion_factor_ * factor;
4886  ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4887  compiler->set_current_expansion_factor(new_factor);
4888  }
4889  }
4890  }
4891 
4894  }
4895 
4896  bool ok_to_expand() { return ok_to_expand_; }
4897 
4898  private:
4902 
4904 };
4905 
4906 
4907 RegExpNode* RegExpQuantifier::ToNode(int min,
4908  int max,
4909  bool is_greedy,
4910  RegExpTree* body,
4911  RegExpCompiler* compiler,
4912  RegExpNode* on_success,
4913  bool not_at_start) {
4914  // x{f, t} becomes this:
4915  //
4916  // (r++)<-.
4917  // | `
4918  // | (x)
4919  // v ^
4920  // (r=0)-->(?)---/ [if r < t]
4921  // |
4922  // [if r >= f] \----> ...
4923  //
4924 
4925  // 15.10.2.5 RepeatMatcher algorithm.
4926  // The parser has already eliminated the case where max is 0. In the case
4927  // where max_match is zero the parser has removed the quantifier if min was
4928  // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4929 
4930  // If we know that we cannot match zero length then things are a little
4931  // simpler since we don't need to make the special zero length match check
4932  // from step 2.1. If the min and max are small we can unroll a little in
4933  // this case.
4934  static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
4935  static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
4936  if (max == 0) return on_success; // This can happen due to recursion.
4937  bool body_can_be_empty = (body->min_match() == 0);
4938  int body_start_reg = RegExpCompiler::kNoRegister;
4939  Interval capture_registers = body->CaptureRegisters();
4940  bool needs_capture_clearing = !capture_registers.is_empty();
4941  Zone* zone = compiler->zone();
4942 
4943  if (body_can_be_empty) {
4944  body_start_reg = compiler->AllocateRegister();
4945  } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
4946  // Only unroll if there are no captures and the body can't be
4947  // empty.
4948  {
4949  RegExpExpansionLimiter limiter(
4950  compiler, min + ((max != min) ? 1 : 0));
4951  if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4952  int new_max = (max == kInfinity) ? max : max - min;
4953  // Recurse once to get the loop or optional matches after the fixed
4954  // ones.
4955  RegExpNode* answer = ToNode(
4956  0, new_max, is_greedy, body, compiler, on_success, true);
4957  // Unroll the forced matches from 0 to min. This can cause chains of
4958  // TextNodes (which the parser does not generate). These should be
4959  // combined if it turns out they hinder good code generation.
4960  for (int i = 0; i < min; i++) {
4961  answer = body->ToNode(compiler, answer);
4962  }
4963  return answer;
4964  }
4965  }
4966  if (max <= kMaxUnrolledMaxMatches && min == 0) {
4967  DCHECK(max > 0); // Due to the 'if' above.
4968  RegExpExpansionLimiter limiter(compiler, max);
4969  if (limiter.ok_to_expand()) {
4970  // Unroll the optional matches up to max.
4971  RegExpNode* answer = on_success;
4972  for (int i = 0; i < max; i++) {
4973  ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
4974  if (is_greedy) {
4975  alternation->AddAlternative(
4976  GuardedAlternative(body->ToNode(compiler, answer)));
4977  alternation->AddAlternative(GuardedAlternative(on_success));
4978  } else {
4979  alternation->AddAlternative(GuardedAlternative(on_success));
4980  alternation->AddAlternative(
4981  GuardedAlternative(body->ToNode(compiler, answer)));
4982  }
4983  answer = alternation;
4984  if (not_at_start) alternation->set_not_at_start();
4985  }
4986  return answer;
4987  }
4988  }
4989  }
4990  bool has_min = min > 0;
4991  bool has_max = max < RegExpTree::kInfinity;
4992  bool needs_counter = has_min || has_max;
4993  int reg_ctr = needs_counter
4994  ? compiler->AllocateRegister()
4996  LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4997  zone);
4998  if (not_at_start) center->set_not_at_start();
4999  RegExpNode* loop_return = needs_counter
5000  ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5001  : static_cast<RegExpNode*>(center);
5002  if (body_can_be_empty) {
5003  // If the body can be empty we need to check if it was and then
5004  // backtrack.
5005  loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5006  reg_ctr,
5007  min,
5008  loop_return);
5009  }
5010  RegExpNode* body_node = body->ToNode(compiler, loop_return);
5011  if (body_can_be_empty) {
5012  // If the body can be empty we need to store the start position
5013  // so we can bail out if it was empty.
5014  body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5015  }
5016  if (needs_capture_clearing) {
5017  // Before entering the body of this loop we need to clear captures.
5018  body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5019  }
5020  GuardedAlternative body_alt(body_node);
5021  if (has_max) {
5022  Guard* body_guard =
5023  new(zone) Guard(reg_ctr, Guard::LT, max);
5024  body_alt.AddGuard(body_guard, zone);
5025  }
5026  GuardedAlternative rest_alt(on_success);
5027  if (has_min) {
5028  Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5029  rest_alt.AddGuard(rest_guard, zone);
5030  }
5031  if (is_greedy) {
5032  center->AddLoopAlternative(body_alt);
5033  center->AddContinueAlternative(rest_alt);
5034  } else {
5035  center->AddContinueAlternative(rest_alt);
5036  center->AddLoopAlternative(body_alt);
5037  }
5038  if (needs_counter) {
5039  return ActionNode::SetRegister(reg_ctr, 0, center);
5040  } else {
5041  return center;
5042  }
5043 }
5044 
5045 
5046 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5047  RegExpNode* on_success) {
5048  NodeInfo info;
5049  Zone* zone = compiler->zone();
5050 
5051  switch (assertion_type()) {
5052  case START_OF_LINE:
5053  return AssertionNode::AfterNewline(on_success);
5054  case START_OF_INPUT:
5055  return AssertionNode::AtStart(on_success);
5056  case BOUNDARY:
5057  return AssertionNode::AtBoundary(on_success);
5058  case NON_BOUNDARY:
5059  return AssertionNode::AtNonBoundary(on_success);
5060  case END_OF_INPUT:
5061  return AssertionNode::AtEnd(on_success);
5062  case END_OF_LINE: {
5063  // Compile $ in multiline regexps as an alternation with a positive
5064  // lookahead in one side and an end-of-input on the other side.
5065  // We need two registers for the lookahead.
5066  int stack_pointer_register = compiler->AllocateRegister();
5067  int position_register = compiler->AllocateRegister();
5068  // The ChoiceNode to distinguish between a newline and end-of-input.
5069  ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5070  // Create a newline atom.
5071  ZoneList<CharacterRange>* newline_ranges =
5072  new(zone) ZoneList<CharacterRange>(3, zone);
5073  CharacterRange::AddClassEscape('n', newline_ranges, zone);
5074  RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5075  TextNode* newline_matcher = new(zone) TextNode(
5076  newline_atom,
5077  ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5078  position_register,
5079  0, // No captures inside.
5080  -1, // Ignored if no captures.
5081  on_success));
5082  // Create an end-of-input matcher.
5083  RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5084  stack_pointer_register,
5085  position_register,
5086  newline_matcher);
5087  // Add the two alternatives to the ChoiceNode.
5088  GuardedAlternative eol_alternative(end_of_line);
5089  result->AddAlternative(eol_alternative);
5090  GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5091  result->AddAlternative(end_alternative);
5092  return result;
5093  }
5094  default:
5095  UNREACHABLE();
5096  }
5097  return on_success;
5098 }
5099 
5100 
5101 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5102  RegExpNode* on_success) {
5103  return new(compiler->zone())
5104  BackReferenceNode(RegExpCapture::StartRegister(index()),
5105  RegExpCapture::EndRegister(index()),
5106  on_success);
5107 }
5108 
5109 
5110 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5111  RegExpNode* on_success) {
5112  return on_success;
5113 }
5114 
5115 
5116 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
5117  RegExpNode* on_success) {
5118  int stack_pointer_register = compiler->AllocateRegister();
5119  int position_register = compiler->AllocateRegister();
5120 
5121  const int registers_per_capture = 2;
5122  const int register_of_first_capture = 2;
5123  int register_count = capture_count_ * registers_per_capture;
5124  int register_start =
5125  register_of_first_capture + capture_from_ * registers_per_capture;
5126 
5127  RegExpNode* success;
5128  if (is_positive()) {
5129  RegExpNode* node = ActionNode::BeginSubmatch(
5130  stack_pointer_register,
5131  position_register,
5132  body()->ToNode(
5133  compiler,
5134  ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5135  position_register,
5136  register_count,
5137  register_start,
5138  on_success)));
5139  return node;
5140  } else {
5141  // We use a ChoiceNode for a negative lookahead because it has most of
5142  // the characteristics we need. It has the body of the lookahead as its
5143  // first alternative and the expression after the lookahead of the second
5144  // alternative. If the first alternative succeeds then the
5145  // NegativeSubmatchSuccess will unwind the stack including everything the
5146  // choice node set up and backtrack. If the first alternative fails then
5147  // the second alternative is tried, which is exactly the desired result
5148  // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5149  // ChoiceNode that knows to ignore the first exit when calculating quick
5150  // checks.
5151  Zone* zone = compiler->zone();
5152 
5153  GuardedAlternative body_alt(
5154  body()->ToNode(
5155  compiler,
5156  success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5157  position_register,
5158  register_count,
5159  register_start,
5160  zone)));
5161  ChoiceNode* choice_node =
5162  new(zone) NegativeLookaheadChoiceNode(body_alt,
5163  GuardedAlternative(on_success),
5164  zone);
5165  return ActionNode::BeginSubmatch(stack_pointer_register,
5166  position_register,
5167  choice_node);
5168  }
5169 }
5170 
5171 
5172 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5173  RegExpNode* on_success) {
5174  return ToNode(body(), index(), compiler, on_success);
5175 }
5176 
5177 
5178 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5179  int index,
5180  RegExpCompiler* compiler,
5181  RegExpNode* on_success) {
5182  int start_reg = RegExpCapture::StartRegister(index);
5183  int end_reg = RegExpCapture::EndRegister(index);
5184  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5185  RegExpNode* body_node = body->ToNode(compiler, store_end);
5186  return ActionNode::StorePosition(start_reg, true, body_node);
5187 }
5188 
5189 
5190 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5191  RegExpNode* on_success) {
5192  ZoneList<RegExpTree*>* children = nodes();
5193  RegExpNode* current = on_success;
5194  for (int i = children->length() - 1; i >= 0; i--) {
5195  current = children->at(i)->ToNode(compiler, current);
5196  }
5197  return current;
5198 }
5199 
5200 
5201 static void AddClass(const int* elmv,
5202  int elmc,
5203  ZoneList<CharacterRange>* ranges,
5204  Zone* zone) {
5205  elmc--;
5206  DCHECK(elmv[elmc] == 0x10000);
5207  for (int i = 0; i < elmc; i += 2) {
5208  DCHECK(elmv[i] < elmv[i + 1]);
5209  ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5210  }
5211 }
5212 
5213 
5214 static void AddClassNegated(const int *elmv,
5215  int elmc,
5216  ZoneList<CharacterRange>* ranges,
5217  Zone* zone) {
5218  elmc--;
5219  DCHECK(elmv[elmc] == 0x10000);
5220  DCHECK(elmv[0] != 0x0000);
5221  DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5222  uc16 last = 0x0000;
5223  for (int i = 0; i < elmc; i += 2) {
5224  DCHECK(last <= elmv[i] - 1);
5225  DCHECK(elmv[i] < elmv[i + 1]);
5226  ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5227  last = elmv[i + 1];
5228  }
5229  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5230 }
5231 
5232 
5234  ZoneList<CharacterRange>* ranges,
5235  Zone* zone) {
5236  switch (type) {
5237  case 's':
5238  AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5239  break;
5240  case 'S':
5242  break;
5243  case 'w':
5244  AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5245  break;
5246  case 'W':
5248  break;
5249  case 'd':
5250  AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5251  break;
5252  case 'D':
5254  break;
5255  case '.':
5258  ranges,
5259  zone);
5260  break;
5261  // This is not a character range as defined by the spec but a
5262  // convenient shorthand for a character class that matches any
5263  // character.
5264  case '*':
5265  ranges->Add(CharacterRange::Everything(), zone);
5266  break;
5267  // This is the set of characters matched by the $ and ^ symbols
5268  // in multiline mode.
5269  case 'n':
5272  ranges,
5273  zone);
5274  break;
5275  default:
5276  UNREACHABLE();
5277  }
5278 }
5279 
5280 
5283 }
5284 
5285 
5287  public:
5289  ZoneList<CharacterRange>** excluded,
5290  Zone* zone)
5291  : included_(included),
5292  excluded_(excluded),
5293  zone_(zone) { }
5294  void Call(uc16 from, DispatchTable::Entry entry);
5295 
5296  static const int kInBase = 0;
5297  static const int kInOverlay = 1;
5298 
5299  private:
5303 };
5304 
5305 
5307  if (!entry.out_set()->Get(kInBase)) return;
5308  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5309  ? included_
5310  : excluded_;
5311  if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5312  (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5313 }
5314 
5315 
5317  Vector<const int> overlay,
5318  ZoneList<CharacterRange>** included,
5319  ZoneList<CharacterRange>** excluded,
5320  Zone* zone) {
5321  DCHECK_EQ(NULL, *included);
5322  DCHECK_EQ(NULL, *excluded);
5323  DispatchTable table(zone);
5324  for (int i = 0; i < base->length(); i++)
5325  table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5326  for (int i = 0; i < overlay.length(); i += 2) {
5327  table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5329  }
5330  CharacterRangeSplitter callback(included, excluded, zone);
5331  table.ForEach(&callback);
5332 }
5333 
5334 
5336  bool is_one_byte, Zone* zone) {
5337  Isolate* isolate = zone->isolate();
5338  uc16 bottom = from();
5339  uc16 top = to();
5340  if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
5341  if (bottom > String::kMaxOneByteCharCode) return;
5343  }
5345  if (top == bottom) {
5346  // If this is a singleton we just expand the one character.
5347  int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5348  for (int i = 0; i < length; i++) {
5349  uc32 chr = chars[i];
5350  if (chr != bottom) {
5351  ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5352  }
5353  }
5354  } else {
5355  // If this is a range we expand the characters block by block,
5356  // expanding contiguous subranges (blocks) one at a time.
5357  // The approach is as follows. For a given start character we
5358  // look up the remainder of the block that contains it (represented
5359  // by the end point), for instance we find 'z' if the character
5360  // is 'c'. A block is characterized by the property
5361  // that all characters uncanonicalize in the same way, except that
5362  // each entry in the result is incremented by the distance from the first
5363  // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5364  // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5365  // Once we've found the end point we look up its uncanonicalization
5366  // and produce a range for each element. For instance for [c-f]
5367  // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
5368  // add a range if it is not already contained in the input, so [c-f]
5369  // will be skipped but [C-F] will be added. If this range is not
5370  // completely contained in a block we do this for all the blocks
5371  // covered by the range (handling characters that is not in a block
5372  // as a "singleton block").
5374  int pos = bottom;
5375  while (pos <= top) {
5376  int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5377  uc16 block_end;
5378  if (length == 0) {
5379  block_end = pos;
5380  } else {
5381  DCHECK_EQ(1, length);
5382  block_end = range[0];
5383  }
5384  int end = (block_end > top) ? top : block_end;
5385  length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5386  for (int i = 0; i < length; i++) {
5387  uc32 c = range[i];
5388  uc16 range_from = c - (block_end - pos);
5389  uc16 range_to = c - (block_end - end);
5390  if (!(bottom <= range_from && range_to <= top)) {
5391  ranges->Add(CharacterRange(range_from, range_to), zone);
5392  }
5393  }
5394  pos = end + 1;
5395  }
5396  }
5397 }
5398 
5399 
5401  DCHECK_NOT_NULL(ranges);
5402  int n = ranges->length();
5403  if (n <= 1) return true;
5404  int max = ranges->at(0).to();
5405  for (int i = 1; i < n; i++) {
5406  CharacterRange next_range = ranges->at(i);
5407  if (next_range.from() <= max + 1) return false;
5408  max = next_range.to();
5409  }
5410  return true;
5411 }
5412 
5413 
5414 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5415  if (ranges_ == NULL) {
5416  ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5417  CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5418  }
5419  return ranges_;
5420 }
5421 
5422 
5423 // Move a number of elements in a zonelist to another position
5424 // in the same list. Handles overlapping source and target areas.
5426  int from,
5427  int to,
5428  int count) {
5429  // Ranges are potentially overlapping.
5430  if (from < to) {
5431  for (int i = count - 1; i >= 0; i--) {
5432  list->at(to + i) = list->at(from + i);
5433  }
5434  } else {
5435  for (int i = 0; i < count; i++) {
5436  list->at(to + i) = list->at(from + i);
5437  }
5438  }
5439 }
5440 
5441 
5443  int count,
5444  CharacterRange insert) {
5445  // Inserts a range into list[0..count[, which must be sorted
5446  // by from value and non-overlapping and non-adjacent, using at most
5447  // list[0..count] for the result. Returns the number of resulting
5448  // canonicalized ranges. Inserting a range may collapse existing ranges into
5449  // fewer ranges, so the return value can be anything in the range 1..count+1.
5450  uc16 from = insert.from();
5451  uc16 to = insert.to();
5452  int start_pos = 0;
5453  int end_pos = count;
5454  for (int i = count - 1; i >= 0; i--) {
5455  CharacterRange current = list->at(i);
5456  if (current.from() > to + 1) {
5457  end_pos = i;
5458  } else if (current.to() + 1 < from) {
5459  start_pos = i + 1;
5460  break;
5461  }
5462  }
5463 
5464  // Inserted range overlaps, or is adjacent to, ranges at positions
5465  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5466  // not affected by the insertion.
5467  // If start_pos == end_pos, the range must be inserted before start_pos.
5468  // if start_pos < end_pos, the entire range from start_pos to end_pos
5469  // must be merged with the insert range.
5470 
5471  if (start_pos == end_pos) {
5472  // Insert between existing ranges at position start_pos.
5473  if (start_pos < count) {
5474  MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5475  }
5476  list->at(start_pos) = insert;
5477  return count + 1;
5478  }
5479  if (start_pos + 1 == end_pos) {
5480  // Replace single existing range at position start_pos.
5481  CharacterRange to_replace = list->at(start_pos);
5482  int new_from = Min(to_replace.from(), from);
5483  int new_to = Max(to_replace.to(), to);
5484  list->at(start_pos) = CharacterRange(new_from, new_to);
5485  return count;
5486  }
5487  // Replace a number of existing ranges from start_pos to end_pos - 1.
5488  // Move the remaining ranges down.
5489 
5490  int new_from = Min(list->at(start_pos).from(), from);
5491  int new_to = Max(list->at(end_pos - 1).to(), to);
5492  if (end_pos < count) {
5493  MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5494  }
5495  list->at(start_pos) = CharacterRange(new_from, new_to);
5496  return count - (end_pos - start_pos) + 1;
5497 }
5498 
5499 
5501  // Special/default classes are always considered canonical. The result
5502  // of calling ranges() will be sorted.
5503  if (ranges_ == NULL) return;
5505 }
5506 
5507 
5509  if (character_ranges->length() <= 1) return;
5510  // Check whether ranges are already canonical (increasing, non-overlapping,
5511  // non-adjacent).
5512  int n = character_ranges->length();
5513  int max = character_ranges->at(0).to();
5514  int i = 1;
5515  while (i < n) {
5516  CharacterRange current = character_ranges->at(i);
5517  if (current.from() <= max + 1) {
5518  break;
5519  }
5520  max = current.to();
5521  i++;
5522  }
5523  // Canonical until the i'th range. If that's all of them, we are done.
5524  if (i == n) return;
5525 
5526  // The ranges at index i and forward are not canonicalized. Make them so by
5527  // doing the equivalent of insertion sort (inserting each into the previous
5528  // list, in order).
5529  // Notice that inserting a range can reduce the number of ranges in the
5530  // result due to combining of adjacent and overlapping ranges.
5531  int read = i; // Range to insert.
5532  int num_canonical = i; // Length of canonicalized part of list.
5533  do {
5534  num_canonical = InsertRangeInCanonicalList(character_ranges,
5535  num_canonical,
5536  character_ranges->at(read));
5537  read++;
5538  } while (read < n);
5539  character_ranges->Rewind(num_canonical);
5540 
5541  DCHECK(CharacterRange::IsCanonical(character_ranges));
5542 }
5543 
5544 
5546  ZoneList<CharacterRange>* negated_ranges,
5547  Zone* zone) {
5549  DCHECK_EQ(0, negated_ranges->length());
5550  int range_count = ranges->length();
5551  uc16 from = 0;
5552  int i = 0;
5553  if (range_count > 0 && ranges->at(0).from() == 0) {
5554  from = ranges->at(0).to();
5555  i = 1;
5556  }
5557  while (i < range_count) {
5558  CharacterRange range = ranges->at(i);
5559  negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5560  from = range.to();
5561  i++;
5562  }
5564  negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5565  zone);
5566  }
5567 }
5568 
5569 
5570 // -------------------------------------------------------------------
5571 // Splay tree
5572 
5573 
5574 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5575  if (Get(value))
5576  return this;
5577  if (successors(zone) != NULL) {
5578  for (int i = 0; i < successors(zone)->length(); i++) {
5579  OutSet* successor = successors(zone)->at(i);
5580  if (successor->Get(value))
5581  return successor;
5582  }
5583  } else {
5584  successors_ = new(zone) ZoneList<OutSet*>(2, zone);
5585  }
5586  OutSet* result = new(zone) OutSet(first_, remaining_);
5587  result->Set(value, zone);
5588  successors(zone)->Add(result, zone);
5589  return result;
5590 }
5591 
5592 
5593 void OutSet::Set(unsigned value, Zone *zone) {
5594  if (value < kFirstLimit) {
5595  first_ |= (1 << value);
5596  } else {
5597  if (remaining_ == NULL)
5598  remaining_ = new(zone) ZoneList<unsigned>(1, zone);
5599  if (remaining_->is_empty() || !remaining_->Contains(value))
5600  remaining_->Add(value, zone);
5601  }
5602 }
5603 
5604 
5605 bool OutSet::Get(unsigned value) const {
5606  if (value < kFirstLimit) {
5607  return (first_ & (1 << value)) != 0;
5608  } else if (remaining_ == NULL) {
5609  return false;
5610  } else {
5611  return remaining_->Contains(value);
5612  }
5613 }
5614 
5615 
5617 
5618 
5619 void DispatchTable::AddRange(CharacterRange full_range, int value,
5620  Zone* zone) {
5621  CharacterRange current = full_range;
5622  if (tree()->is_empty()) {
5623  // If this is the first range we just insert into the table.
5625  DCHECK_RESULT(tree()->Insert(current.from(), &loc));
5626  loc.set_value(Entry(current.from(), current.to(),
5627  empty()->Extend(value, zone)));
5628  return;
5629  }
5630  // First see if there is a range to the left of this one that
5631  // overlaps.
5633  if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5634  Entry* entry = &loc.value();
5635  // If we've found a range that overlaps with this one, and it
5636  // starts strictly to the left of this one, we have to fix it
5637  // because the following code only handles ranges that start on
5638  // or after the start point of the range we're adding.
5639  if (entry->from() < current.from() && entry->to() >= current.from()) {
5640  // Snap the overlapping range in half around the start point of
5641  // the range we're adding.
5642  CharacterRange left(entry->from(), current.from() - 1);
5643  CharacterRange right(current.from(), entry->to());
5644  // The left part of the overlapping range doesn't overlap.
5645  // Truncate the whole entry to be just the left part.
5646  entry->set_to(left.to());
5647  // The right part is the one that overlaps. We add this part
5648  // to the map and let the next step deal with merging it with
5649  // the range we're adding.
5651  DCHECK_RESULT(tree()->Insert(right.from(), &loc));
5652  loc.set_value(Entry(right.from(),
5653  right.to(),
5654  entry->out_set()));
5655  }
5656  }
5657  while (current.is_valid()) {
5658  if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5659  (loc.value().from() <= current.to()) &&
5660  (loc.value().to() >= current.from())) {
5661  Entry* entry = &loc.value();
5662  // We have overlap. If there is space between the start point of
5663  // the range we're adding and where the overlapping range starts
5664  // then we have to add a range covering just that space.
5665  if (current.from() < entry->from()) {
5667  DCHECK_RESULT(tree()->Insert(current.from(), &ins));
5668  ins.set_value(Entry(current.from(),
5669  entry->from() - 1,
5670  empty()->Extend(value, zone)));
5671  current.set_from(entry->from());
5672  }
5673  DCHECK_EQ(current.from(), entry->from());
5674  // If the overlapping range extends beyond the one we want to add
5675  // we have to snap the right part off and add it separately.
5676  if (entry->to() > current.to()) {
5678  DCHECK_RESULT(tree()->Insert(current.to() + 1, &ins));
5679  ins.set_value(Entry(current.to() + 1,
5680  entry->to(),
5681  entry->out_set()));
5682  entry->set_to(current.to());
5683  }
5684  DCHECK(entry->to() <= current.to());
5685  // The overlapping range is now completely contained by the range
5686  // we're adding so we can just update it and move the start point
5687  // of the range we're adding just past it.
5688  entry->AddValue(value, zone);
5689  // Bail out if the last interval ended at 0xFFFF since otherwise
5690  // adding 1 will wrap around to 0.
5691  if (entry->to() == String::kMaxUtf16CodeUnit)
5692  break;
5693  DCHECK(entry->to() + 1 > current.from());
5694  current.set_from(entry->to() + 1);
5695  } else {
5696  // There is no overlap so we can just add the range
5698  DCHECK_RESULT(tree()->Insert(current.from(), &ins));
5699  ins.set_value(Entry(current.from(),
5700  current.to(),
5701  empty()->Extend(value, zone)));
5702  break;
5703  }
5704  }
5705 }
5706 
5707 
5710  if (!tree()->FindGreatestLessThan(value, &loc))
5711  return empty();
5712  Entry* entry = &loc.value();
5713  if (value <= entry->to())
5714  return entry->out_set();
5715  else
5716  return empty();
5717 }
5718 
5719 
5720 // -------------------------------------------------------------------
5721 // Analysis
5722 
5723 
5725  StackLimitCheck check(that->zone()->isolate());
5726  if (check.HasOverflowed()) {
5727  fail("Stack overflow");
5728  return;
5729  }
5730  if (that->info()->been_analyzed || that->info()->being_analyzed)
5731  return;
5732  that->info()->being_analyzed = true;
5733  that->Accept(this);
5734  that->info()->being_analyzed = false;
5735  that->info()->been_analyzed = true;
5736 }
5737 
5738 
5739 void Analysis::VisitEnd(EndNode* that) {
5740  // nothing to do
5741 }
5742 
5743 
5745  int element_count = elements()->length();
5746  // Set up the offsets of the elements relative to the start. This is a fixed
5747  // quantity since a TextNode can only contain fixed-width things.
5748  int cp_offset = 0;
5749  for (int i = 0; i < element_count; i++) {
5750  TextElement& elm = elements()->at(i);
5751  elm.set_cp_offset(cp_offset);
5752  cp_offset += elm.length();
5753  }
5754 }
5755 
5756 
5757 void Analysis::VisitText(TextNode* that) {
5758  if (ignore_case_) {
5760  }
5761  EnsureAnalyzed(that->on_success());
5762  if (!has_failed()) {
5763  that->CalculateOffsets();
5764  }
5765 }
5766 
5767 
5768 void Analysis::VisitAction(ActionNode* that) {
5769  RegExpNode* target = that->on_success();
5770  EnsureAnalyzed(target);
5771  if (!has_failed()) {
5772  // If the next node is interested in what it follows then this node
5773  // has to be interested too so it can pass the information on.
5774  that->info()->AddFromFollowing(target->info());
5775  }
5776 }
5777 
5778 
5779 void Analysis::VisitChoice(ChoiceNode* that) {
5780  NodeInfo* info = that->info();
5781  for (int i = 0; i < that->alternatives()->length(); i++) {
5782  RegExpNode* node = that->alternatives()->at(i).node();
5783  EnsureAnalyzed(node);
5784  if (has_failed()) return;
5785  // Anything the following nodes need to know has to be known by
5786  // this node also, so it can pass it on.
5787  info->AddFromFollowing(node->info());
5788  }
5789 }
5790 
5791 
5793  NodeInfo* info = that->info();
5794  for (int i = 0; i < that->alternatives()->length(); i++) {
5795  RegExpNode* node = that->alternatives()->at(i).node();
5796  if (node != that->loop_node()) {
5797  EnsureAnalyzed(node);
5798  if (has_failed()) return;
5799  info->AddFromFollowing(node->info());
5800  }
5801  }
5802  // Check the loop last since it may need the value of this node
5803  // to get a correct result.
5804  EnsureAnalyzed(that->loop_node());
5805  if (!has_failed()) {
5806  info->AddFromFollowing(that->loop_node()->info());
5807  }
5808 }
5809 
5810 
5811 void Analysis::VisitBackReference(BackReferenceNode* that) {
5812  EnsureAnalyzed(that->on_success());
5813 }
5814 
5815 
5816 void Analysis::VisitAssertion(AssertionNode* that) {
5817  EnsureAnalyzed(that->on_success());
5818 }
5819 
5820 
5822  int budget,
5823  BoyerMooreLookahead* bm,
5824  bool not_at_start) {
5825  // Working out the set of characters that a backreference can match is too
5826  // hard, so we just say that any character can match.
5827  bm->SetRest(offset);
5828  SaveBMInfo(bm, not_at_start, offset);
5829 }
5830 
5831 
5834 
5835 
5837  int budget,
5838  BoyerMooreLookahead* bm,
5839  bool not_at_start) {
5841  budget = (budget - 1) / alts->length();
5842  for (int i = 0; i < alts->length(); i++) {
5843  GuardedAlternative& alt = alts->at(i);
5844  if (alt.guards() != NULL && alt.guards()->length() != 0) {
5845  bm->SetRest(offset); // Give up trying to fill in info.
5846  SaveBMInfo(bm, not_at_start, offset);
5847  return;
5848  }
5849  alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5850  }
5851  SaveBMInfo(bm, not_at_start, offset);
5852 }
5853 
5854 
5855 void TextNode::FillInBMInfo(int initial_offset,
5856  int budget,
5857  BoyerMooreLookahead* bm,
5858  bool not_at_start) {
5859  if (initial_offset >= bm->length()) return;
5860  int offset = initial_offset;
5861  int max_char = bm->max_char();
5862  for (int i = 0; i < elements()->length(); i++) {
5863  if (offset >= bm->length()) {
5864  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5865  return;
5866  }
5867  TextElement text = elements()->at(i);
5868  if (text.text_type() == TextElement::ATOM) {
5869  RegExpAtom* atom = text.atom();
5870  for (int j = 0; j < atom->length(); j++, offset++) {
5871  if (offset >= bm->length()) {
5872  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5873  return;
5874  }
5875  uc16 character = atom->data()[j];
5876  if (bm->compiler()->ignore_case()) {
5878  int length = GetCaseIndependentLetters(
5879  Isolate::Current(),
5880  character,
5882  chars);
5883  for (int j = 0; j < length; j++) {
5884  bm->Set(offset, chars[j]);
5885  }
5886  } else {
5887  if (character <= max_char) bm->Set(offset, character);
5888  }
5889  }
5890  } else {
5891  DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
5892  RegExpCharacterClass* char_class = text.char_class();
5893  ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
5894  if (char_class->is_negated()) {
5895  bm->SetAll(offset);
5896  } else {
5897  for (int k = 0; k < ranges->length(); k++) {
5898  CharacterRange& range = ranges->at(k);
5899  if (range.from() > max_char) continue;
5900  int to = Min(max_char, static_cast<int>(range.to()));
5901  bm->SetInterval(offset, Interval(range.from(), to));
5902  }
5903  }
5904  offset++;
5905  }
5906  }
5907  if (offset >= bm->length()) {
5908  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5909  return;
5910  }
5911  on_success()->FillInBMInfo(offset,
5912  budget - 1,
5913  bm,
5914  true); // Not at start after a text node.
5915  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5916 }
5917 
5918 
5919 // -------------------------------------------------------------------
5920 // Dispatch table construction
5921 
5922 
5923 void DispatchTableConstructor::VisitEnd(EndNode* that) {
5925 }
5926 
5927 
5929  node->set_being_calculated(true);
5930  ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5931  for (int i = 0; i < alternatives->length(); i++) {
5933  alternatives->at(i).node()->Accept(this);
5934  }
5935  node->set_being_calculated(false);
5936 }
5937 
5938 
5940  public:
5942  : constructor_(constructor) { }
5943  void Call(uc32 from, DispatchTable::Entry entry);
5944  private:
5946 };
5947 
5948 
5950  CharacterRange range(from, entry.to());
5951  constructor_->AddRange(range);
5952 }
5953 
5954 
5955 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5956  if (node->being_calculated())
5957  return;
5959  AddDispatchRange adder(this);
5960  table->ForEach(&adder);
5961 }
5962 
5963 
5964 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5965  // TODO(160): Find the node that we refer back to and propagate its start
5966  // set back to here. For now we just accept anything.
5968 }
5969 
5970 
5971 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5972  RegExpNode* target = that->on_success();
5973  target->Accept(this);
5974 }
5975 
5976 
5978  const CharacterRange* b) {
5979  return Compare<uc16>(a->from(), b->from());
5980 }
5981 
5982 
5984  ranges->Sort(CompareRangeByFrom);
5985  uc16 last = 0;
5986  for (int i = 0; i < ranges->length(); i++) {
5987  CharacterRange range = ranges->at(i);
5988  if (last < range.from())
5989  AddRange(CharacterRange(last, range.from() - 1));
5990  if (range.to() >= last) {
5991  if (range.to() == String::kMaxUtf16CodeUnit) {
5992  return;
5993  } else {
5994  last = range.to() + 1;
5995  }
5996  }
5997  }
5999 }
6000 
6001 
6002 void DispatchTableConstructor::VisitText(TextNode* that) {
6003  TextElement elm = that->elements()->at(0);
6004  switch (elm.text_type()) {
6005  case TextElement::ATOM: {
6006  uc16 c = elm.atom()->data()[0];
6007  AddRange(CharacterRange(c, c));
6008  break;
6009  }
6010  case TextElement::CHAR_CLASS: {
6011  RegExpCharacterClass* tree = elm.char_class();
6012  ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6013  if (tree->is_negated()) {
6014  AddInverse(ranges);
6015  } else {
6016  for (int i = 0; i < ranges->length(); i++)
6017  AddRange(ranges->at(i));
6018  }
6019  break;
6020  }
6021  default: {
6022  UNIMPLEMENTED();
6023  }
6024  }
6025 }
6026 
6027 
6028 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6029  RegExpNode* target = that->on_success();
6030  target->Accept(this);
6031 }
6032 
6033 
6035  RegExpCompileData* data, bool ignore_case, bool is_global,
6036  bool is_multiline, bool is_sticky, Handle<String> pattern,
6037  Handle<String> sample_subject, bool is_one_byte, Zone* zone) {
6038  if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6039  return IrregexpRegExpTooBig(zone->isolate());
6040  }
6041  RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte, zone);
6042 
6043  // Sample some characters from the middle of the string.
6044  static const int kSampleSize = 128;
6045 
6046  sample_subject = String::Flatten(sample_subject);
6047  int chars_sampled = 0;
6048  int half_way = (sample_subject->length() - kSampleSize) / 2;
6049  for (int i = Max(0, half_way);
6050  i < sample_subject->length() && chars_sampled < kSampleSize;
6051  i++, chars_sampled++) {
6052  compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6053  }
6054 
6055  // Wrap the body of the regexp in capture #0.
6056  RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6057  0,
6058  &compiler,
6059  compiler.accept());
6060  RegExpNode* node = captured_body;
6061  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6062  bool is_start_anchored = data->tree->IsAnchoredAtStart();
6063  int max_length = data->tree->max_match();
6064  if (!is_start_anchored && !is_sticky) {
6065  // Add a .*? at the beginning, outside the body capture, unless
6066  // this expression is anchored at the beginning or sticky.
6067  RegExpNode* loop_node =
6068  RegExpQuantifier::ToNode(0,
6070  false,
6071  new(zone) RegExpCharacterClass('*'),
6072  &compiler,
6073  captured_body,
6074  data->contains_anchor);
6075 
6076  if (data->contains_anchor) {
6077  // Unroll loop once, to take care of the case that might start
6078  // at the start of input.
6079  ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6080  first_step_node->AddAlternative(GuardedAlternative(captured_body));
6081  first_step_node->AddAlternative(GuardedAlternative(
6082  new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
6083  node = first_step_node;
6084  } else {
6085  node = loop_node;
6086  }
6087  }
6088  if (is_one_byte) {
6089  node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6090  // Do it again to propagate the new nodes to places where they were not
6091  // put because they had not been calculated yet.
6092  if (node != NULL) {
6093  node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6094  }
6095  }
6096 
6097  if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6098  data->node = node;
6099  Analysis analysis(ignore_case, is_one_byte);
6100  analysis.EnsureAnalyzed(node);
6101  if (analysis.has_failed()) {
6102  const char* error_message = analysis.error_message();
6103  return CompilationResult(zone->isolate(), error_message);
6104  }
6105 
6106  // Create the correct assembler for the architecture.
6107 #ifndef V8_INTERPRETED_REGEXP
6108  // Native regexp implementation.
6109 
6113 
6114 #if V8_TARGET_ARCH_IA32
6115  RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6116  zone);
6117 #elif V8_TARGET_ARCH_X64
6118  RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6119  zone);
6120 #elif V8_TARGET_ARCH_ARM
6121  RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6122  zone);
6123 #elif V8_TARGET_ARCH_ARM64
6124  RegExpMacroAssemblerARM64 macro_assembler(mode, (data->capture_count + 1) * 2,
6125  zone);
6126 #elif V8_TARGET_ARCH_MIPS
6127  RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6128  zone);
6129 #elif V8_TARGET_ARCH_MIPS64
6130  RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6131  zone);
6132 #elif V8_TARGET_ARCH_X87
6133  RegExpMacroAssemblerX87 macro_assembler(mode, (data->capture_count + 1) * 2,
6134  zone);
6135 #else
6136 #error "Unsupported architecture"
6137 #endif
6138 
6139 #else // V8_INTERPRETED_REGEXP
6140  // Interpreted regexp implementation.
6142  RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
6143 #endif // V8_INTERPRETED_REGEXP
6144 
6145  // Inserted here, instead of in Assembler, because it depends on information
6146  // in the AST that isn't replicated in the Node structure.
6147  static const int kMaxBacksearchLimit = 1024;
6148  if (is_end_anchored &&
6149  !is_start_anchored &&
6150  max_length < kMaxBacksearchLimit) {
6151  macro_assembler.SetCurrentPositionFromEnd(max_length);
6152  }
6153 
6154  if (is_global) {
6155  macro_assembler.set_global_mode(
6156  (data->tree->min_match() > 0)
6159  }
6160 
6161  return compiler.Assemble(&macro_assembler,
6162  node,
6163  data->capture_count,
6164  pattern);
6165 }
6166 
6167 
6168 }} // namespace v8::internal
#define DECLARE_VISIT(type)
static uint16_t ConvertNonLatin1ToLatin1(uint16_t)
Definition: unicode-inl.h:60
int get(uchar c, uchar n, uchar *result)
Definition: unicode-inl.h:27
static const uchar kBadChar
Definition: unicode.h:139
virtual void FillInBMInfo(int offset, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:2284
struct v8::internal::ActionNode::@22::@28 u_clear_captures
struct v8::internal::ActionNode::@22::@23 u_store_register
static ActionNode * EmptyMatchCheck(int start_register, int repetition_register, int repetition_limit, RegExpNode *on_success)
Definition: jsregexp.cc:1538
union v8::internal::ActionNode::@22 data_
static ActionNode * StorePosition(int reg, bool is_capture, RegExpNode *on_success)
Definition: jsregexp.cc:1491
static ActionNode * BeginSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *on_success)
Definition: jsregexp.cc:1512
static ActionNode * PositiveSubmatchSuccess(int stack_pointer_reg, int restore_reg, int clear_capture_count, int clear_capture_from, RegExpNode *on_success)
Definition: jsregexp.cc:1523
ActionType action_type_
Definition: jsregexp.h:825
static ActionNode * SetRegister(int reg, int val, RegExpNode *on_success)
Definition: jsregexp.cc:1472
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start)
Definition: jsregexp.cc:2273
static ActionNode * IncrementRegister(int reg, RegExpNode *on_success)
Definition: jsregexp.cc:1483
struct v8::internal::ActionNode::@22::@25 u_position_register
struct v8::internal::ActionNode::@22::@24 u_increment_register
struct v8::internal::ActionNode::@22::@27 u_empty_match_check
static ActionNode * ClearCaptures(Interval range, RegExpNode *on_success)
Definition: jsregexp.cc:1502
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:4234
struct v8::internal::ActionNode::@22::@26 u_submatch
void Call(uc32 from, DispatchTable::Entry entry)
Definition: jsregexp.cc:5949
AddDispatchRange(DispatchTableConstructor *constructor)
Definition: jsregexp.cc:5941
DispatchTableConstructor * constructor_
Definition: jsregexp.cc:5945
AlternativeGeneration * at(int i)
Definition: jsregexp.cc:3563
AlternativeGenerationList(int count, Zone *zone)
Definition: jsregexp.cc:3547
ZoneList< AlternativeGeneration * > alt_gens_
Definition: jsregexp.cc:3569
QuickCheckDetails quick_check_details
Definition: jsregexp.cc:3539
const char * error_message()
Definition: jsregexp.h:1614
void EnsureAnalyzed(RegExpNode *node)
Definition: jsregexp.cc:5724
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition: jsregexp.cc:5792
void fail(const char *error_message)
Definition: jsregexp.h:1618
virtual void FillInBMInfo(int offset, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:2313
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start)
Definition: jsregexp.cc:2297
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3145
void EmitBoundaryCheck(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3047
static AssertionNode * AtEnd(RegExpNode *on_success)
Definition: jsregexp.h:892
static AssertionNode * AfterNewline(RegExpNode *on_success)
Definition: jsregexp.h:904
static AssertionNode * AtBoundary(RegExpNode *on_success)
Definition: jsregexp.h:898
static AssertionNode * AtStart(RegExpNode *on_success)
Definition: jsregexp.h:895
void BacktrackIfPrevious(RegExpCompiler *compiler, Trace *trace, IfPrevious backtrack_if_previous)
Definition: jsregexp.cc:3098
static AssertionNode * AtNonBoundary(RegExpNode *on_success)
Definition: jsregexp.h:901
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int filled_in, bool not_at_start)
Definition: jsregexp.cc:3130
virtual void FillInBMInfo(int offset, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:5821
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:4356
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2324
int GetSkipTable(int min_lookahead, int max_lookahead, Handle< ByteArray > boolean_skip_table)
Definition: jsregexp.cc:3727
void SetInterval(int map_number, const Interval &interval)
Definition: jsregexp.h:1306
ZoneList< BoyerMoorePositionInfo * > * bitmaps_
Definition: jsregexp.h:1334
void SetAll(int map_number)
Definition: jsregexp.h:1316
int Count(int map_number)
Definition: jsregexp.h:1294
BoyerMoorePositionInfo * at(int i)
Definition: jsregexp.h:1298
void SetRest(int from_map)
Definition: jsregexp.h:1320
RegExpCompiler * compiler()
Definition: jsregexp.h:1292
BoyerMooreLookahead(int length, RegExpCompiler *compiler, Zone *zone)
Definition: jsregexp.cc:3633
bool FindWorthwhileInterval(int *from, int *to)
Definition: jsregexp.cc:3652
int FindBestInterval(int max_number_of_chars, int old_biggest_points, int *from, int *to)
Definition: jsregexp.cc:3674
void EmitSkipInstructions(RegExpMacroAssembler *masm)
Definition: jsregexp.cc:3754
void Set(int map_number, int character)
Definition: jsregexp.h:1300
void SetInterval(const Interval &interval)
Definition: jsregexp.cc:3600
void Call(uc16 from, DispatchTable::Entry entry)
Definition: jsregexp.cc:5306
ZoneList< CharacterRange > ** excluded_
Definition: jsregexp.cc:5301
CharacterRangeSplitter(ZoneList< CharacterRange > **included, ZoneList< CharacterRange > **excluded, Zone *zone)
Definition: jsregexp.cc:5288
ZoneList< CharacterRange > ** included_
Definition: jsregexp.cc:5300
static void Split(ZoneList< CharacterRange > *base, Vector< const int > overlay, ZoneList< CharacterRange > **included, ZoneList< CharacterRange > **excluded, Zone *zone)
Definition: jsregexp.cc:5316
static void Canonicalize(ZoneList< CharacterRange > *ranges)
Definition: jsregexp.cc:5508
void set_from(uc16 value)
Definition: jsregexp.h:260
void AddCaseEquivalents(ZoneList< CharacterRange > *ranges, bool is_one_byte, Zone *zone)
Definition: jsregexp.cc:5335
static Vector< const int > GetWordBounds()
Definition: jsregexp.cc:5281
static CharacterRange Singleton(uc16 value)
Definition: jsregexp.h:248
static void Negate(ZoneList< CharacterRange > *src, ZoneList< CharacterRange > *dst, Zone *zone)
Definition: jsregexp.cc:5545
static bool IsCanonical(ZoneList< CharacterRange > *ranges)
Definition: jsregexp.cc:5400
static CharacterRange Everything()
Definition: jsregexp.h:255
static void AddClassEscape(uc16 type, ZoneList< CharacterRange > *ranges, Zone *zone)
Definition: jsregexp.cc:5233
int EmitOptimizedUnanchoredSearch(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:4052
DispatchTable * GetTable(bool ignore_case)
Definition: jsregexp.cc:936
ZoneList< GuardedAlternative > * alternatives()
Definition: jsregexp.h:1064
ZoneList< GuardedAlternative > * alternatives_
Definition: jsregexp.h:1092
void AddAlternative(GuardedAlternative node)
Definition: jsregexp.h:1061
int GreedyLoopTextLengthForAlternative(GuardedAlternative *alternative)
Definition: jsregexp.cc:3446
int CalculatePreloadCharacters(RegExpCompiler *compiler, int eats_at_least)
Definition: jsregexp.cc:3506
Trace * EmitGreedyLoop(RegExpCompiler *compiler, Trace *trace, AlternativeGenerationList *alt_gens, PreloadState *preloads, GreedyLoopState *greedy_loop_state, int text_length)
Definition: jsregexp.cc:4005
void set_being_calculated(bool b)
Definition: jsregexp.h:1084
virtual void FillInBMInfo(int offset, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:5836
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:2962
virtual bool try_to_emit_quick_check_for_alternative(bool is_first)
Definition: jsregexp.h:1085
void EmitChoices(RegExpCompiler *compiler, AlternativeGenerationList *alt_gens, int first_choice, Trace *trace, PreloadState *preloads)
Definition: jsregexp.cc:4103
void EmitOutOfLineContinuation(RegExpCompiler *compiler, Trace *trace, GuardedAlternative alternative, AlternativeGeneration *alt_gen, int preload_characters, bool next_expects_preload)
Definition: jsregexp.cc:4192
void SetUpPreLoad(RegExpCompiler *compiler, Trace *current_trace, PreloadState *preloads)
Definition: jsregexp.cc:3915
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start)
Definition: jsregexp.cc:2400
virtual RegExpNode * FilterOneByte(int depth, bool ignore_case)
Definition: jsregexp.cc:2861
void AssertGuardsMentionRegisters(Trace *trace)
Definition: jsregexp.cc:3900
void GenerateGuard(RegExpMacroAssembler *macro_assembler, Guard *guard, Trace *trace)
Definition: jsregexp.cc:1568
int EatsAtLeastHelper(int still_to_find, int budget, RegExpNode *ignore_this_node, bool not_at_start)
Definition: jsregexp.cc:2370
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3933
MaybeHandle< FixedArray > LookupRegExp(Handle< String > source, JSRegExp::Flags flags)
void PutRegExp(Handle< String > source, JSRegExp::Flags flags, Handle< FixedArray > data)
void AddRange(CharacterRange range)
Definition: jsregexp.h:1565
void BuildTable(ChoiceNode *node)
Definition: jsregexp.cc:5928
void AddInverse(ZoneList< CharacterRange > *ranges)
Definition: jsregexp.cc:5983
void AddValue(int value, Zone *zone)
Definition: jsregexp.h:337
void AddRange(CharacterRange range, int value, Zone *zone)
Definition: jsregexp.cc:5619
OutSet * Get(uc16 value)
Definition: jsregexp.cc:5708
void ForEach(Callback *callback)
Definition: jsregexp.h:368
ZoneSplayTree< Config > * tree()
Definition: jsregexp.h:377
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:1441
Object * get(int index)
Definition: objects-inl.h:2165
void set(int index, Object *value)
Definition: objects-inl.h:2190
int Frequency(int in_character)
Definition: jsregexp.cc:962
void CountCharacter(int character)
Definition: jsregexp.cc:954
GreedyLoopState(bool not_at_start)
Definition: jsregexp.cc:3894
void AddGuard(Guard *guard, Zone *zone)
Definition: jsregexp.cc:1465
ZoneList< Guard * > * guards()
Definition: jsregexp.h:1040
static Handle< T > cast(Handle< S > that)
Definition: handles.h:116
Isolate * GetIsolate() const
Definition: objects-inl.h:1387
Isolate * isolate()
Definition: heap-inl.h:589
void IncreaseTotalRegexpCodeGenerated(int size)
Definition: heap.h:1172
double total_regexp_code_generated()
Definition: heap.h:1171
int from() const
Definition: jsregexp.h:713
bool Contains(int value)
Definition: jsregexp.h:709
static RegExpImpl::IrregexpResult Match(Isolate *isolate, Handle< ByteArray > code, Handle< String > subject, int *captures, int start_position)
unibrow::Mapping< unibrow::CanonicalizationRange > * jsregexp_canonrange()
Definition: isolate.h:929
MemoryAllocator * memory_allocator()
Definition: isolate.h:883
Object * Throw(Object *exception, MessageLocation *location=NULL)
Definition: isolate.cc:832
CodeTracer * GetCodeTracer()
Definition: isolate.cc:2154
unibrow::Mapping< unibrow::Ecma262UnCanonicalize > * jsregexp_uncanonicalize()
Definition: isolate.h:925
CompilationCache * compilation_cache()
Definition: isolate.h:865
static const int kJSRegexpStaticOffsetsVectorSize
Definition: isolate.h:984
Object * StackOverflow()
Definition: isolate.cc:773
Factory * factory()
Definition: isolate.h:982
bool has_pending_exception()
Definition: isolate.h:581
static void EnsureSize(Handle< JSArray > array, int minimum_size_of_backing_fixed_array)
Definition: objects-inl.h:6955
static const int kUninitializedValue
Definition: objects.h:7829
static const int kIrregexpCaptureCountIndex
Definition: objects.h:7806
static const int kCompilationErrorValue
Definition: objects.h:7833
static const int kAtomPatternIndex
Definition: objects.h:7782
static const int kIrregexpMaxRegisterCountIndex
Definition: objects.h:7804
static int saved_code_index(bool is_latin1)
Definition: objects.h:7758
static int code_index(bool is_latin1)
Definition: objects.h:7750
void Add(const T &element, AllocationPolicy allocator=AllocationPolicy())
Definition: list-inl.h:17
T & at(int i) const
Definition: list.h:69
void Sort(int(*cmp)(const T *x, const T *y))
Definition: list-inl.h:194
bool Contains(const T &elm) const
Definition: list-inl.h:174
void AddContinueAlternative(GuardedAlternative alt)
Definition: jsregexp.cc:3476
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:2935
void AddLoopAlternative(GuardedAlternative alt)
Definition: jsregexp.cc:3469
virtual void FillInBMInfo(int offset, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:2948
virtual void Accept(NodeVisitor *visitor)
Definition: jsregexp.cc:1559
RegExpNode * loop_node()
Definition: jsregexp.h:1185
virtual RegExpNode * FilterOneByte(int depth, bool ignore_case)
Definition: jsregexp.cc:2843
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start)
Definition: jsregexp.cc:2390
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3483
static Result Match(Handle< Code > regexp, Handle< String > subject, int *offsets_vector, int offsets_vector_length, int previous_index, Isolate *isolate)
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start)
Definition: jsregexp.cc:2347
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:2358
virtual RegExpNode * FilterOneByte(int depth, bool ignore_case)
Definition: jsregexp.cc:2912
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:1414
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition: jsregexp.h:1548
void Set(unsigned value, Zone *zone)
Definition: jsregexp.cc:5593
OutSet * Extend(unsigned value, Zone *zone)
Definition: jsregexp.cc:5574
ZoneList< unsigned > * remaining_
Definition: jsregexp.h:317
ZoneList< OutSet * > * successors_
Definition: jsregexp.h:318
bool Get(unsigned value) const
Definition: jsregexp.cc:5605
static const unsigned kFirstLimit
Definition: jsregexp.h:301
ZoneList< OutSet * > * successors(Zone *zone)
Definition: jsregexp.h:312
Position * positions(int index)
Definition: jsregexp.h:546
void Merge(QuickCheckDetails *other, int from_index)
Definition: jsregexp.cc:2712
void set_characters(int characters)
Definition: jsregexp.h:545
bool Rationalize(bool one_byte)
Definition: jsregexp.cc:2421
void Advance(int by, bool one_byte)
Definition: jsregexp.cc:2691
RecursionCheck(RegExpCompiler *compiler)
Definition: jsregexp.cc:1057
RegExpCompiler * compiler_
Definition: jsregexp.cc:1062
static const int kMaxRecursion
Definition: jsregexp.cc:1020
FrequencyCollator frequency_collator_
Definition: jsregexp.cc:1050
static const int kNoRegister
Definition: jsregexp.cc:1038
void set_current_expansion_factor(int value)
Definition: jsregexp.cc:1032
RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler *assembler, RegExpNode *start, int capture_count, Handle< String > pattern)
Definition: jsregexp.cc:1089
RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte, Zone *zone)
Definition: jsregexp.cc:1073
RegExpMacroAssembler * macro_assembler()
Definition: jsregexp.cc:1017
List< RegExpNode * > * work_list_
Definition: jsregexp.cc:1043
RegExpMacroAssembler * macro_assembler_
Definition: jsregexp.cc:1045
FrequencyCollator * frequency_collator()
Definition: jsregexp.cc:1029
void AddWork(RegExpNode *node)
Definition: jsregexp.cc:1011
static CompilationResult Compile(RegExpCompileData *input, bool ignore_case, bool global, bool multiline, bool sticky, Handle< String > pattern, Handle< String > sample_subject, bool is_one_byte, Zone *zone)
Definition: jsregexp.cc:6034
static void DotPrint(const char *label, RegExpNode *node, bool ignore_case)
DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter)
RegExpExpansionLimiter(RegExpCompiler *compiler, int factor)
Definition: jsregexp.cc:4874
GlobalCache(Handle< JSRegExp > regexp, Handle< String > subject, bool is_global, Isolate *isolate)
Definition: jsregexp.cc:685
static const int kRegWxpCompiledLimit
Definition: jsregexp.h:216
static MUST_USE_RESULT MaybeHandle< Object > Exec(Handle< JSRegExp > regexp, Handle< String > subject, int index, Handle< JSArray > lastMatchInfo)
Definition: jsregexp.cc:225
static MUST_USE_RESULT MaybeHandle< Object > IrregexpExec(Handle< JSRegExp > regexp, Handle< String > subject, int index, Handle< JSArray > lastMatchInfo)
Definition: jsregexp.cc:614
static void SetLastCaptureCount(FixedArray *array, int to)
Definition: jsregexp.h:182
static void SetIrregexpMaxRegisterCount(FixedArray *re, int value)
Definition: jsregexp.cc:465
static bool CompileIrregexp(Handle< JSRegExp > re, Handle< String > sample_subject, bool is_one_byte)
Definition: jsregexp.cc:389
static void SetLastSubject(FixedArray *array, String *to)
Definition: jsregexp.h:186
static Handle< Object > AtomExec(Handle< JSRegExp > regexp, Handle< String > subject, int index, Handle< JSArray > lastMatchInfo)
Definition: jsregexp.cc:322
static void AtomCompile(Handle< JSRegExp > re, Handle< String > pattern, JSRegExp::Flags flags, Handle< String > match_pattern)
Definition: jsregexp.cc:245
static const int kLastMatchOverhead
Definition: jsregexp.h:165
static void SetCapture(FixedArray *array, int index, int to)
Definition: jsregexp.h:194
static ByteArray * IrregexpByteCode(FixedArray *re, bool is_one_byte)
Definition: jsregexp.cc:480
static Handle< JSArray > SetLastMatchInfo(Handle< JSArray > last_match_info, Handle< String > subject, int capture_count, int32_t *match)
Definition: jsregexp.cc:662
static int AtomExecRaw(Handle< JSRegExp > regexp, Handle< String > subject, int index, int32_t *output, int output_size)
Definition: jsregexp.cc:270
static void IrregexpInitialize(Handle< JSRegExp > re, Handle< String > pattern, JSRegExp::Flags flags, int capture_register_count)
Definition: jsregexp.cc:490
static int IrregexpMaxRegisterCount(FixedArray *re)
Definition: jsregexp.cc:459
static const int kRegExpExecutableMemoryLimit
Definition: jsregexp.h:215
static MUST_USE_RESULT MaybeHandle< Object > CreateRegExpLiteral(Handle< JSFunction > constructor, Handle< String > pattern, Handle< String > flags)
Definition: jsregexp.cc:50
static void SetLastInput(FixedArray *array, String *to)
Definition: jsregexp.h:190
static int IrregexpPrepare(Handle< JSRegExp > regexp, Handle< String > subject)
Definition: jsregexp.cc:503
static int IrregexpExecRaw(Handle< JSRegExp > regexp, Handle< String > subject, int index, int32_t *output, int output_size)
Definition: jsregexp.cc:526
static Code * IrregexpNativeCode(FixedArray *re, bool is_one_byte)
Definition: jsregexp.cc:485
static int IrregexpNumberOfRegisters(FixedArray *re)
Definition: jsregexp.cc:475
static bool EnsureCompiledIrregexp(Handle< JSRegExp > re, Handle< String > sample_subject, bool is_one_byte)
Definition: jsregexp.cc:352
static int IrregexpNumberOfCaptures(FixedArray *re)
Definition: jsregexp.cc:470
static MUST_USE_RESULT MaybeHandle< Object > Compile(Handle< JSRegExp > re, Handle< String > pattern, Handle< String > flags)
Definition: jsregexp.cc:156
virtual void SetCurrentPositionFromEnd(int by)
virtual void LoadCurrentCharacter(int cp_offset, Label *on_end_of_input, bool check_bounds=true, int characters=1)=0
virtual void CheckBitInTable(Handle< ByteArray > table, Label *on_bit_set)=0
virtual void PushRegister(int register_index, StackCheckFlag check_stack_limit)=0
virtual void Bind(Label *label)=0
virtual void PushBacktrack(Label *label)=0
virtual void IfRegisterEqPos(int reg, Label *if_eq)=0
virtual void CheckPosition(int cp_offset, Label *on_outside_input)
virtual void SetRegister(int register_index, int to)=0
virtual bool CheckSpecialCharacterClass(uc16 type, Label *on_no_match)
virtual void CheckAtStart(Label *on_at_start)=0
virtual void CheckCharacterNotInRange(uc16 from, uc16 to, Label *on_not_in_range)=0
virtual void CheckCharacterAfterAnd(unsigned c, unsigned and_with, Label *on_equal)=0
virtual void CheckCharacterLT(uc16 limit, Label *on_less)=0
virtual void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 and_with, Label *on_not_equal)=0
virtual void CheckNotBackReference(int start_reg, Label *on_no_match)=0
virtual void CheckCharacter(unsigned c, Label *on_equal)=0
virtual void ReadCurrentPositionFromRegister(int reg)=0
virtual void CheckCharacterGT(uc16 limit, Label *on_greater)=0
virtual void CheckCharacterInRange(uc16 from, uc16 to, Label *on_in_range)=0
virtual void IfRegisterLT(int reg, int comparand, Label *if_lt)=0
virtual void ReadStackPointerFromRegister(int reg)=0
virtual void WriteStackPointerToRegister(int reg)=0
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset)=0
virtual void CheckGreedyLoop(Label *on_tos_equals_current_position)=0
virtual Handle< HeapObject > GetCode(Handle< String > source)=0
virtual void AdvanceCurrentPosition(int by)=0
virtual void PopRegister(int register_index)=0
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, Label *on_no_match)=0
virtual void CheckNotCharacter(unsigned c, Label *on_not_equal)=0
virtual void GoTo(Label *label)=0
virtual void AdvanceRegister(int reg, int by)=0
virtual void ClearRegisters(int reg_from, int reg_to)=0
virtual void IfRegisterGE(int reg, int comparand, Label *if_ge)=0
virtual void CheckNotAtStart(Label *on_not_at_start)=0
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, Label *on_not_equal)=0
BoyerMooreLookahead * bm_info(bool not_at_start)
Definition: jsregexp.h:664
Zone * zone() const
Definition: jsregexp.h:668
void set_bm_info(bool not_at_start, BoyerMooreLookahead *bm)
Definition: jsregexp.h:676
virtual RegExpNode * GetSuccessorOfOmnivorousTextNode(RegExpCompiler *compiler)
Definition: jsregexp.h:611
LimitResult LimitVersions(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:2229
void SaveBMInfo(BoyerMooreLookahead *bm, bool not_at_start, int offset)
Definition: jsregexp.h:650
bool EmitQuickCheck(RegExpCompiler *compiler, Trace *bounds_check_trace, Trace *trace, bool preload_has_checked_bounds, Label *on_possible_success, QuickCheckDetails *details_return, bool fall_through_on_failure)
Definition: jsregexp.cc:2445
virtual void FillInBMInfo(int offset, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.h:622
virtual int GreedyLoopTextLength()
Definition: jsregexp.h:608
virtual void Emit(RegExpCompiler *compiler, Trace *trace)=0
virtual void Accept(NodeVisitor *visitor)=0
static const int kRecursionBudget
Definition: jsregexp.h:621
static const int kNodeIsTooComplexForGreedyLoops
Definition: jsregexp.h:607
virtual RegExpNode * FilterOneByte(int depth, bool ignore_case)
Definition: jsregexp.h:632
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)=0
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start)=0
virtual bool IsAnchoredAtStart()
Definition: ast.h:2603
virtual bool IsAnchoredAtEnd()
Definition: ast.h:2604
virtual int min_match()=0
virtual void AppendToText(RegExpText *text, Zone *zone)
Definition: jsregexp.cc:892
static const int kInfinity
Definition: ast.h:2597
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)=0
virtual Interval CaptureRegisters()
Definition: ast.h:2609
virtual int max_match()=0
virtual RegExpNode * FilterOneByte(int depth, bool ignore_case)
Definition: jsregexp.cc:2755
RegExpNode * FilterSuccessor(int depth, bool ignore_case)
Definition: jsregexp.cc:2764
RegExpNode * on_success()
Definition: jsregexp.h:727
static Smi * FromInt(int value)
Definition: objects-inl.h:1321
Vector< const uint8_t > ToOneByteVector()
Definition: objects.h:8639
Vector< const uc16 > ToUC16Vector()
Definition: objects.h:8645
FlatContent GetFlatContent()
Definition: objects.cc:7961
static const uint32_t kMaxOneByteCharCodeU
Definition: objects.h:8812
static const int32_t kMaxOneByteCharCode
Definition: objects.h:8811
static const int kMaxUtf16CodeUnit
Definition: objects.h:8813
SmartArrayPointer< char > ToCString(AllowNullsFlag allow_nulls, RobustnessFlag robustness_flag, int offset, int length, int *length_output=0)
Definition: objects.cc:8004
static Handle< String > Flatten(Handle< String > string, PretenureFlag pretenure=NOT_TENURED)
Definition: objects-inl.h:3354
static bool SkipPass(int pass, bool ignore_case)
Definition: jsregexp.cc:3299
virtual int GreedyLoopTextLength()
Definition: jsregexp.cc:3412
virtual RegExpNode * GetSuccessorOfOmnivorousTextNode(RegExpCompiler *compiler)
Definition: jsregexp.cc:3418
virtual void FillInBMInfo(int offset, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:5855
virtual RegExpNode * FilterOneByte(int depth, bool ignore_case)
Definition: jsregexp.cc:2789
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:2527
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start)
Definition: jsregexp.cc:2334
void MakeCaseIndependent(bool is_one_byte)
Definition: jsregexp.cc:3393
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3315
void TextEmitPass(RegExpCompiler *compiler, TextEmitPassType pass, bool preloaded, Trace *trace, bool first_element_checked, int *checked_up_to)
Definition: jsregexp.cc:3225
ZoneList< TextElement > * elements()
Definition: jsregexp.h:849
ActionNode::ActionType action_type()
Definition: jsregexp.h:1371
void set_stop_node(RegExpNode *node)
Definition: jsregexp.h:1480
int characters_preloaded()
Definition: jsregexp.h:1463
void Flush(RegExpCompiler *compiler, RegExpNode *successor)
Definition: jsregexp.cc:1353
int FindAffectedRegisters(OutSet *affected_registers, Zone *zone)
Definition: jsregexp.cc:1182
RegExpNode * stop_node()
Definition: jsregexp.h:1462
int bound_checked_up_to()
Definition: jsregexp.h:1464
DeferredAction * actions_
Definition: jsregexp.h:1504
TriBool at_start()
Definition: jsregexp.h:1456
Label * loop_label()
Definition: jsregexp.h:1461
void RestoreAffectedRegisters(RegExpMacroAssembler *macro, int max_register, const OutSet &registers_to_pop, const OutSet &registers_to_clear)
Definition: jsregexp.cc:1202
DeferredAction * actions()
Definition: jsregexp.h:1436
void AdvanceCurrentPositionInTrace(int by, RegExpCompiler *compiler)
Definition: jsregexp.cc:3374
void set_characters_preloaded(int count)
Definition: jsregexp.h:1482
void add_action(DeferredAction *new_action)
Definition: jsregexp.h:1474
void set_at_start(bool at_start)
Definition: jsregexp.h:1457
void set_quick_check_performed(QuickCheckDetails *d)
Definition: jsregexp.h:1485
void PerformDeferredActions(RegExpMacroAssembler *macro, int max_register, const OutSet &affected_registers, OutSet *registers_to_pop, OutSet *registers_to_clear, Zone *zone)
Definition: jsregexp.cc:1220
Label * backtrack()
Definition: jsregexp.h:1460
bool mentions_reg(int reg)
Definition: jsregexp.cc:1153
bool GetStoredPosition(int reg, int *cp_offset)
Definition: jsregexp.cc:1164
void set_backtrack(Label *backtrack)
Definition: jsregexp.h:1479
void set_bound_checked_up_to(int to)
Definition: jsregexp.h:1483
QuickCheckDetails quick_check_performed_
Definition: jsregexp.h:1510
void set_loop_label(Label *label)
Definition: jsregexp.h:1481
void set_flush_budget(int to)
Definition: jsregexp.h:1484
void InvalidateCurrentCharacter()
Definition: jsregexp.cc:3369
QuickCheckDetails * quick_check_performed()
Definition: jsregexp.h:1466
T * start() const
Definition: vector.h:47
int length() const
Definition: vector.h:41
VisitMarker(NodeInfo *info)
Definition: jsregexp.cc:2743
Isolate * isolate() const
Definition: zone.h:68
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf map
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes enable context specialization in TurboFan execution budget before interrupt is triggered max percentage of megamorphic generic ICs to allow optimization enable use of SAHF instruction if enable use of VFP3 instructions if available enable use of NEON instructions if enable use of SDIV and UDIV instructions if enable use of MLS instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of d16 d31 registers on ARM this requires VFP3 force all emitted branches to be in long enable alignment of csp to bytes on platforms which prefer the register to always be expose gc extension under the specified name show built in functions in stack traces use random jit cookie to mask large constants minimum length for automatic enable preparsing CPU profiler sampling interval in microseconds trace out of bounds accesses to external arrays default size of stack region v8 is allowed to maximum length of function source code printed in a stack trace min size of a semi the new space consists of two semi spaces print one trace line following each garbage collection do not print trace line after scavenger collection print cumulative GC statistics in only print modified registers Trace simulator debug messages Implied by trace sim abort randomize hashes to avoid predictable hash Fixed seed to use to hash property Print the time it takes to deserialize the snapshot A filename with extra code to be included in the A file to write the raw snapshot bytes to(mksnapshot only)") DEFINE_STRING(raw_context_file
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes enable context specialization in TurboFan execution budget before interrupt is triggered max percentage of megamorphic generic ICs to allow optimization enable use of SAHF instruction if enable use of VFP3 instructions if available enable use of NEON instructions if enable use of SDIV and UDIV instructions if enable use of MLS instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of d16 d31 registers on ARM this requires VFP3 force all emitted branches to be in long enable alignment of csp to bytes on platforms which prefer the register to always be expose gc extension under the specified name show built in functions in stack traces use random jit cookie to mask large constants minimum length for automatic enable preparsing CPU profiler sampling interval in microseconds trace out of bounds accesses to external arrays default size of stack region v8 is allowed to maximum length of function source code printed in a stack trace min size of a semi the new space consists of two semi spaces print one trace line following each garbage collection do not print trace line after scavenger collection print cumulative GC statistics in name
enable harmony numeric enable harmony object literal extensions true
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes enable context specialization in TurboFan execution budget before interrupt is triggered max percentage of megamorphic generic ICs to allow optimization enable use of SAHF instruction if enable use of VFP3 instructions if available enable use of NEON instructions if enable use of SDIV and UDIV instructions if enable use of MLS instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of d16 d31 registers on ARM this requires VFP3 force all emitted branches to be in long mode(MIPS only)") DEFINE_BOOL(enable_always_align_csp
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes enable context specialization in TurboFan execution budget before interrupt is triggered max percentage of megamorphic generic ICs to allow optimization enable use of SAHF instruction if enable use of VFP3 instructions if available enable use of NEON instructions if enable use of SDIV and UDIV instructions if enable use of MLS instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of d16 d31 registers on ARM this requires VFP3 force all emitted branches to be in long enable alignment of csp to bytes on platforms which prefer the register to always be NULL
#define ASSIGN_RETURN_ON_EXCEPTION(isolate, dst, call, T)
Definition: isolate.h:135
#define THROW_NEW_ERROR(isolate, call, T)
Definition: isolate.h:138
#define DEFINE_ACCEPT(Type)
Definition: jsregexp.cc:1551
#define FOR_EACH_NODE_TYPE(VISIT)
Definition: jsregexp.h:382
#define LOG(isolate, Call)
Definition: log.h:69
#define UNREACHABLE()
Definition: logging.h:30
#define DCHECK_LE(v1, v2)
Definition: logging.h:210
#define DCHECK_NOT_NULL(p)
Definition: logging.h:213
#define DCHECK_RESULT(expr)
Definition: logging.h:204
#define DCHECK_GE(v1, v2)
Definition: logging.h:208
#define UNIMPLEMENTED()
Definition: logging.h:28
#define DCHECK(condition)
Definition: logging.h:205
#define DCHECK_LT(v1, v2)
Definition: logging.h:209
#define DCHECK_EQ(v1, v2)
Definition: logging.h:206
void USE(T)
Definition: macros.h:322
#define arraysize(array)
Definition: macros.h:86
#define MUST_USE_RESULT
Definition: macros.h:266
unsigned short uint16_t
Definition: unicode.cc:23
unsigned int uchar
Definition: unicode.h:17
int int32_t
Definition: unicode.cc:24
Vector< const char > CStrVector(const char *data)
Definition: vector.h:158
int SearchString(Isolate *isolate, Vector< const SubjectChar > subject, Vector< const PatternChar > pattern, int start_index)
const int kPatternTooShortForBoyerMoore
Definition: jsregexp.cc:127
static const int kLineTerminatorRanges[]
Definition: jsregexp.cc:3590
static bool CompareInverseRanges(ZoneList< CharacterRange > *ranges, const int *special_class, int length)
Definition: jsregexp.cc:4746
static const int kSurrogateRanges[]
Definition: jsregexp.cc:3588
static void MoveRanges(ZoneList< CharacterRange > *list, int from, int to, int count)
Definition: jsregexp.cc:5425
static void EmitCharClass(RegExpMacroAssembler *macro_assembler, RegExpCharacterClass *cc, bool one_byte, Label *on_failure, int cp_offset, bool check_offset, bool preloaded, Zone *zone)
Definition: jsregexp.cc:2115
static void EmitDoubleBoundaryTest(RegExpMacroAssembler *masm, int first, int last, Label *fall_through, Label *in_range, Label *out_of_range)
Definition: jsregexp.cc:1778
static int min(int a, int b)
Definition: liveedit.cc:273
static LifetimePosition Min(LifetimePosition a, LifetimePosition b)
static int InsertRangeInCanonicalList(ZoneList< CharacterRange > *list, int count, CharacterRange insert)
Definition: jsregexp.cc:5442
static void EmitUseLookupTable(RegExpMacroAssembler *masm, ZoneList< int > *ranges, int start_index, int end_index, int min_char, Label *fall_through, Label *even_label, Label *odd_label)
Definition: jsregexp.cc:1803
static int GetCaseIndependentLetters(Isolate *isolate, uc16 character, bool one_byte_subject, unibrow::uchar *letters)
Definition: jsregexp.cc:1590
static const int kWordRanges[]
Definition: jsregexp.cc:3583
static const int kLineTerminatorRangeCount
Definition: jsregexp.cc:3592
ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b)
Definition: jsregexp.h:1238
static bool EmitAtomLetter(Isolate *isolate, RegExpCompiler *compiler, uc16 c, Label *on_failure, int cp_offset, bool check, bool preloaded)
Definition: jsregexp.cc:1717
static bool DeterminedAlready(QuickCheckDetails *quick_check, int offset)
Definition: jsregexp.cc:3182
static void GenerateBranches(RegExpMacroAssembler *masm, ZoneList< int > *ranges, int start_index, int end_index, uc16 min_char, uc16 max_char, Label *fall_through, Label *even_label, Label *odd_label)
Definition: jsregexp.cc:1963
OStream & endl(OStream &os)
Definition: ostreams.cc:112
static uint32_t SmearBitsRight(uint32_t v)
Definition: jsregexp.cc:2411
static void SplitSearchSpace(ZoneList< int > *ranges, int start_index, int end_index, int *new_start_index, int *new_end_index, int *border)
Definition: jsregexp.cc:1893
static void CreateRegExpErrorObjectAndThrow(Handle< JSRegExp > re, Handle< String > error_message, Isolate *isolate)
Definition: jsregexp.cc:374
static void EmitBoundaryTest(RegExpMacroAssembler *masm, int border, Label *fall_through, Label *above_or_equal, Label *below)
Definition: jsregexp.cc:1764
static bool HasFewDifferentCharacters(Handle< String > pattern)
Definition: jsregexp.cc:132
static void EmitWordCheck(RegExpMacroAssembler *assembler, Label *word, Label *non_word, bool fall_through_on_word)
Definition: jsregexp.cc:2986
static const int kSpaceRanges[]
Definition: jsregexp.cc:3577
const Register pc
static void UpdateBoundsCheck(int index, int *checked_up_to)
Definition: jsregexp.cc:3189
const int kNoRegister
Definition: constants-arm.h:43
static LifetimePosition Max(LifetimePosition a, LifetimePosition b)
static bool EmitAtomNonLetter(Isolate *isolate, RegExpCompiler *compiler, uc16 c, Label *on_failure, int cp_offset, bool check, bool preloaded)
Definition: jsregexp.cc:1636
static bool RangesContainLatin1Equivalents(ZoneList< CharacterRange > *ranges)
Definition: jsregexp.cc:2780
static const int kSpaceRangeCount
Definition: jsregexp.cc:3581
void PrintF(const char *format,...)
Definition: utils.cc:80
const int kMaxLookaheadForBoyerMoore
Definition: jsregexp.cc:124
static bool RangeContainsLatin1Equivalents(CharacterRange range)
Definition: jsregexp.cc:2773
static void AddClassNegated(const int *elmv, int elmc, ZoneList< CharacterRange > *ranges, Zone *zone)
Definition: jsregexp.cc:5214
static int CompareRangeByFrom(const CharacterRange *a, const CharacterRange *b)
Definition: jsregexp.cc:5977
static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate *isolate)
Definition: jsregexp.cc:1066
static bool EmitSimpleCharacter(Isolate *isolate, RegExpCompiler *compiler, uc16 c, Label *on_failure, int cp_offset, bool check, bool preloaded)
Definition: jsregexp.cc:1613
bool EmitCharacterFunction(Isolate *isolate, RegExpCompiler *compiler, uc16 c, Label *on_failure, int cp_offset, bool check, bool preloaded)
Definition: jsregexp.cc:1707
unibrow::Mapping< unibrow::Ecma262Canonicalize > Canonicalize
static const int kDigitRanges[]
Definition: jsregexp.cc:3586
static const int kWordRangeCount
Definition: jsregexp.cc:3585
STATIC_ASSERT(sizeof(CPURegister)==sizeof(Register))
static JSRegExp::Flags RegExpFlagsFromString(Handle< String > str)
Definition: jsregexp.cc:60
uint16_t uc16
Definition: globals.h:184
static void CutOutRange(RegExpMacroAssembler *masm, ZoneList< int > *ranges, int start_index, int end_index, int cut_index, Label *even_label, Label *odd_label)
Definition: jsregexp.cc:1862
static MUST_USE_RESULT MaybeHandle< Object > ThrowRegExpException(Handle< JSRegExp > re, Handle< String > pattern, Handle< String > error_text, const char *message)
Definition: jsregexp.cc:83
static const int kSurrogateRangeCount
Definition: jsregexp.cc:3589
static bool CompareRanges(ZoneList< CharacterRange > *ranges, const int *special_class, int length)
Definition: jsregexp.cc:4777
static void EmitHat(RegExpCompiler *compiler, RegExpNode *on_success, Trace *trace)
Definition: jsregexp.cc:3012
int32_t uc32
Definition: globals.h:185
static const int kDigitRangeCount
Definition: jsregexp.cc:3587
void MemCopy(void *dest, const void *src, size_t size)
Definition: utils.h:350
static bool ShortCutEmitCharacterPair(RegExpMacroAssembler *macro_assembler, bool one_byte, uc16 c1, uc16 c2, Label *on_failure)
Definition: jsregexp.cc:1670
static void AddClass(const int *elmv, int elmc, ZoneList< CharacterRange > *ranges, Zone *zone)
Definition: jsregexp.cc:5201
static void SetAtomLastCapture(FixedArray *array, String *subject, int from, int to)
Definition: jsregexp.cc:257
ContainedInLattice AddRange(ContainedInLattice containment, const int *ranges, int ranges_length, Interval new_range)
Definition: jsregexp.cc:99
const int kNumRegisters
Definition: constants-arm.h:34
Debugger support for the V8 JavaScript engine.
Definition: accessors.cc:20
static const int kMaxWidth
Definition: unicode.h:245
void AddFromFollowing(NodeInfo *that)
Definition: jsregexp.h:490
static const int kEatsAtLeastNotYetInitialized
Definition: jsregexp.h:1530