V8 Project
string-search.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_STRING_SEARCH_H_
6 #define V8_STRING_SEARCH_H_
7 
8 namespace v8 {
9 namespace internal {
10 
11 
12 //---------------------------------------------------------------------
13 // String Search object.
14 //---------------------------------------------------------------------
15 
16 // Class holding constants and methods that apply to all string search variants,
17 // independently of subject and pattern char size.
19  protected:
20  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
21  // limit, we can fix the size of tables. For a needle longer than this limit,
22  // search will not be optimal, since we only build tables for a suffix
23  // of the string, but it is a safe approximation.
24  static const int kBMMaxShift = Isolate::kBMMaxShift;
25 
26  // Reduce alphabet to this size.
27  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
28  // proportional to the input alphabet. We reduce the alphabet size by
29  // equating input characters modulo a smaller alphabet size. This gives
30  // a potentially less efficient searching, but is a safe approximation.
31  // For needles using only characters in the same Unicode 256-code point page,
32  // there is no search speed degradation.
33  static const int kLatin1AlphabetSize = 256;
35 
36  // Bad-char shift table stored in the state. It's length is the alphabet size.
37  // For patterns below this length, the skip length of Boyer-Moore is too short
38  // to compensate for the algorithmic overhead compared to simple brute force.
39  static const int kBMMinPatternLength = 7;
40 
41  static inline bool IsOneByteString(Vector<const uint8_t> string) {
42  return true;
43  }
44 
45  static inline bool IsOneByteString(Vector<const uc16> string) {
46  return String::IsOneByte(string.start(), string.length());
47  }
48 
49  friend class Isolate;
50 };
51 
52 
53 template <typename PatternChar, typename SubjectChar>
54 class StringSearch : private StringSearchBase {
55  public:
57  : isolate_(isolate),
58  pattern_(pattern),
59  start_(Max(0, pattern.length() - kBMMaxShift)) {
60  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
61  if (!IsOneByteString(pattern_)) {
63  return;
64  }
65  }
66  int pattern_length = pattern_.length();
67  if (pattern_length < kBMMinPatternLength) {
68  if (pattern_length == 1) {
70  return;
71  }
73  return;
74  }
76  }
77 
78  int Search(Vector<const SubjectChar> subject, int index) {
79  return strategy_(this, subject, index);
80  }
81 
82  static inline int AlphabetSize() {
83  if (sizeof(PatternChar) == 1) {
84  // Latin1 needle.
85  return kLatin1AlphabetSize;
86  } else {
87  DCHECK(sizeof(PatternChar) == 2);
88  // UC16 needle.
89  return kUC16AlphabetSize;
90  }
91  }
92 
93  private:
94  typedef int (*SearchFunction)( // NOLINT - it's not a cast!
97  int);
98 
101  int) {
102  return -1;
103  }
104 
107  int start_index);
108 
111  int start_index);
112 
115  int start_index);
116 
117  static int BoyerMooreHorspoolSearch(
120  int start_index);
121 
124  int start_index);
125 
127 
129 
130  static inline bool exceedsOneByte(uint8_t c) {
131  return false;
132  }
133 
134  static inline bool exceedsOneByte(uint16_t c) {
135  return c > String::kMaxOneByteCharCodeU;
136  }
137 
138  static inline int CharOccurrence(int* bad_char_occurrence,
139  SubjectChar char_code) {
140  if (sizeof(SubjectChar) == 1) {
141  return bad_char_occurrence[static_cast<int>(char_code)];
142  }
143  if (sizeof(PatternChar) == 1) {
144  if (exceedsOneByte(char_code)) {
145  return -1;
146  }
147  return bad_char_occurrence[static_cast<unsigned int>(char_code)];
148  }
149  // Both pattern and subject are UC16. Reduce character to equivalence class.
150  int equiv_class = char_code % kUC16AlphabetSize;
151  return bad_char_occurrence[equiv_class];
152  }
153 
154  // The following tables are shared by all searches.
155  // TODO(lrn): Introduce a way for a pattern to keep its tables
156  // between searches (e.g., for an Atom RegExp).
157 
158  // Store for the BoyerMoore(Horspool) bad char shift table.
159  // Return a table covering the last kBMMaxShift+1 positions of
160  // pattern.
161  int* bad_char_table() {
162  return isolate_->bad_char_shift_table();
163  }
164 
165  // Store for the BoyerMoore good suffix shift table.
167  // Return biased pointer that maps the range [start_..pattern_.length()
168  // to the kGoodSuffixShiftTable array.
169  return isolate_->good_suffix_shift_table() - start_;
170  }
171 
172  // Table used temporarily while building the BoyerMoore good suffix
173  // shift table.
174  int* suffix_table() {
175  // Return biased pointer that maps the range [start_..pattern_.length()
176  // to the kSuffixTable array.
177  return isolate_->suffix_table() - start_;
178  }
179 
181  // The pattern to search for.
183  // Pointer to implementation of the search.
185  // Cache value of Max(0, pattern_length() - kBMMaxShift)
186  int start_;
187 };
188 
189 
190 //---------------------------------------------------------------------
191 // Single Character Pattern Search Strategy
192 //---------------------------------------------------------------------
193 
194 template <typename PatternChar, typename SubjectChar>
198  int index) {
199  DCHECK_EQ(1, search->pattern_.length());
200  PatternChar pattern_first_char = search->pattern_[0];
201  int i = index;
202  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
203  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
204  memchr(subject.start() + i,
205  pattern_first_char,
206  subject.length() - i));
207  if (pos == NULL) return -1;
208  return static_cast<int>(pos - subject.start());
209  } else {
210  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
211  if (exceedsOneByte(pattern_first_char)) {
212  return -1;
213  }
214  }
215  SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
216  int n = subject.length();
217  while (i < n) {
218  if (subject[i++] == search_char) return i - 1;
219  }
220  return -1;
221  }
222 }
223 
224 //---------------------------------------------------------------------
225 // Linear Search Strategy
226 //---------------------------------------------------------------------
227 
228 
229 template <typename PatternChar, typename SubjectChar>
230 inline bool CharCompare(const PatternChar* pattern,
231  const SubjectChar* subject,
232  int length) {
233  DCHECK(length > 0);
234  int pos = 0;
235  do {
236  if (pattern[pos] != subject[pos]) {
237  return false;
238  }
239  pos++;
240  } while (pos < length);
241  return true;
242 }
243 
244 
245 // Simple linear search for short patterns. Never bails out.
246 template <typename PatternChar, typename SubjectChar>
250  int index) {
251  Vector<const PatternChar> pattern = search->pattern_;
252  DCHECK(pattern.length() > 1);
253  int pattern_length = pattern.length();
254  PatternChar pattern_first_char = pattern[0];
255  int i = index;
256  int n = subject.length() - pattern_length;
257  while (i <= n) {
258  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
259  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
260  memchr(subject.start() + i,
261  pattern_first_char,
262  n - i + 1));
263  if (pos == NULL) return -1;
264  i = static_cast<int>(pos - subject.start()) + 1;
265  } else {
266  if (subject[i++] != pattern_first_char) continue;
267  }
268  // Loop extracted to separate function to allow using return to do
269  // a deeper break.
270  if (CharCompare(pattern.start() + 1,
271  subject.start() + i,
272  pattern_length - 1)) {
273  return i - 1;
274  }
275  }
276  return -1;
277 }
278 
279 //---------------------------------------------------------------------
280 // Boyer-Moore string search
281 //---------------------------------------------------------------------
282 
283 template <typename PatternChar, typename SubjectChar>
287  int start_index) {
288  Vector<const PatternChar> pattern = search->pattern_;
289  int subject_length = subject.length();
290  int pattern_length = pattern.length();
291  // Only preprocess at most kBMMaxShift last characters of pattern.
292  int start = search->start_;
293 
294  int* bad_char_occurence = search->bad_char_table();
295  int* good_suffix_shift = search->good_suffix_shift_table();
296 
297  PatternChar last_char = pattern[pattern_length - 1];
298  int index = start_index;
299  // Continue search from i.
300  while (index <= subject_length - pattern_length) {
301  int j = pattern_length - 1;
302  int c;
303  while (last_char != (c = subject[index + j])) {
304  int shift =
305  j - CharOccurrence(bad_char_occurence, c);
306  index += shift;
307  if (index > subject_length - pattern_length) {
308  return -1;
309  }
310  }
311  while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
312  if (j < 0) {
313  return index;
314  } else if (j < start) {
315  // we have matched more than our tables allow us to be smart about.
316  // Fall back on BMH shift.
317  index += pattern_length - 1
318  - CharOccurrence(bad_char_occurence,
319  static_cast<SubjectChar>(last_char));
320  } else {
321  int gs_shift = good_suffix_shift[j + 1];
322  int bc_occ =
323  CharOccurrence(bad_char_occurence, c);
324  int shift = j - bc_occ;
325  if (gs_shift > shift) {
326  shift = gs_shift;
327  }
328  index += shift;
329  }
330  }
331 
332  return -1;
333 }
334 
335 
336 template <typename PatternChar, typename SubjectChar>
338  int pattern_length = pattern_.length();
339  const PatternChar* pattern = pattern_.start();
340  // Only look at the last kBMMaxShift characters of pattern (from start_
341  // to pattern_length).
342  int start = start_;
343  int length = pattern_length - start;
344 
345  // Biased tables so that we can use pattern indices as table indices,
346  // even if we only cover the part of the pattern from offset start.
347  int* shift_table = good_suffix_shift_table();
348  int* suffix_table = this->suffix_table();
349 
350  // Initialize table.
351  for (int i = start; i < pattern_length; i++) {
352  shift_table[i] = length;
353  }
354  shift_table[pattern_length] = 1;
355  suffix_table[pattern_length] = pattern_length + 1;
356 
357  if (pattern_length <= start) {
358  return;
359  }
360 
361  // Find suffixes.
362  PatternChar last_char = pattern[pattern_length - 1];
363  int suffix = pattern_length + 1;
364  {
365  int i = pattern_length;
366  while (i > start) {
367  PatternChar c = pattern[i - 1];
368  while (suffix <= pattern_length && c != pattern[suffix - 1]) {
369  if (shift_table[suffix] == length) {
370  shift_table[suffix] = suffix - i;
371  }
372  suffix = suffix_table[suffix];
373  }
374  suffix_table[--i] = --suffix;
375  if (suffix == pattern_length) {
376  // No suffix to extend, so we check against last_char only.
377  while ((i > start) && (pattern[i - 1] != last_char)) {
378  if (shift_table[pattern_length] == length) {
379  shift_table[pattern_length] = pattern_length - i;
380  }
381  suffix_table[--i] = pattern_length;
382  }
383  if (i > start) {
384  suffix_table[--i] = --suffix;
385  }
386  }
387  }
388  }
389  // Build shift table using suffixes.
390  if (suffix < pattern_length) {
391  for (int i = start; i <= pattern_length; i++) {
392  if (shift_table[i] == length) {
393  shift_table[i] = suffix - start;
394  }
395  if (i == suffix) {
396  suffix = suffix_table[suffix];
397  }
398  }
399  }
400 }
401 
402 //---------------------------------------------------------------------
403 // Boyer-Moore-Horspool string search.
404 //---------------------------------------------------------------------
405 
406 template <typename PatternChar, typename SubjectChar>
410  int start_index) {
411  Vector<const PatternChar> pattern = search->pattern_;
412  int subject_length = subject.length();
413  int pattern_length = pattern.length();
414  int* char_occurrences = search->bad_char_table();
415  int badness = -pattern_length;
416 
417  // How bad we are doing without a good-suffix table.
418  PatternChar last_char = pattern[pattern_length - 1];
419  int last_char_shift = pattern_length - 1 -
420  CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
421  // Perform search
422  int index = start_index; // No matches found prior to this index.
423  while (index <= subject_length - pattern_length) {
424  int j = pattern_length - 1;
425  int subject_char;
426  while (last_char != (subject_char = subject[index + j])) {
427  int bc_occ = CharOccurrence(char_occurrences, subject_char);
428  int shift = j - bc_occ;
429  index += shift;
430  badness += 1 - shift; // at most zero, so badness cannot increase.
431  if (index > subject_length - pattern_length) {
432  return -1;
433  }
434  }
435  j--;
436  while (j >= 0 && pattern[j] == (subject[index + j])) j--;
437  if (j < 0) {
438  return index;
439  } else {
440  index += last_char_shift;
441  // Badness increases by the number of characters we have
442  // checked, and decreases by the number of characters we
443  // can skip by shifting. It's a measure of how we are doing
444  // compared to reading each character exactly once.
445  badness += (pattern_length - j) - last_char_shift;
446  if (badness > 0) {
447  search->PopulateBoyerMooreTable();
448  search->strategy_ = &BoyerMooreSearch;
449  return BoyerMooreSearch(search, subject, index);
450  }
451  }
452  }
453  return -1;
454 }
455 
456 
457 template <typename PatternChar, typename SubjectChar>
459  int pattern_length = pattern_.length();
460 
461  int* bad_char_occurrence = bad_char_table();
462 
463  // Only preprocess at most kBMMaxShift last characters of pattern.
464  int start = start_;
465  // Run forwards to populate bad_char_table, so that *last* instance
466  // of character equivalence class is the one registered.
467  // Notice: Doesn't include the last character.
468  int table_size = AlphabetSize();
469  if (start == 0) { // All patterns less than kBMMaxShift in length.
470  memset(bad_char_occurrence,
471  -1,
472  table_size * sizeof(*bad_char_occurrence));
473  } else {
474  for (int i = 0; i < table_size; i++) {
475  bad_char_occurrence[i] = start - 1;
476  }
477  }
478  for (int i = start; i < pattern_length - 1; i++) {
479  PatternChar c = pattern_[i];
480  int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
481  bad_char_occurrence[bucket] = i;
482  }
483 }
484 
485 //---------------------------------------------------------------------
486 // Linear string search with bailout to BMH.
487 //---------------------------------------------------------------------
488 
489 // Simple linear search for short patterns, which bails out if the string
490 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
491 template <typename PatternChar, typename SubjectChar>
495  int index) {
496  Vector<const PatternChar> pattern = search->pattern_;
497  int pattern_length = pattern.length();
498  // Badness is a count of how much work we have done. When we have
499  // done enough work we decide it's probably worth switching to a better
500  // algorithm.
501  int badness = -10 - (pattern_length << 2);
502 
503  // We know our pattern is at least 2 characters, we cache the first so
504  // the common case of the first character not matching is faster.
505  PatternChar pattern_first_char = pattern[0];
506  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
507  badness++;
508  if (badness <= 0) {
509  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
510  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
511  memchr(subject.start() + i,
512  pattern_first_char,
513  n - i + 1));
514  if (pos == NULL) {
515  return -1;
516  }
517  i = static_cast<int>(pos - subject.start());
518  } else {
519  if (subject[i] != pattern_first_char) continue;
520  }
521  int j = 1;
522  do {
523  if (pattern[j] != subject[i + j]) {
524  break;
525  }
526  j++;
527  } while (j < pattern_length);
528  if (j == pattern_length) {
529  return i;
530  }
531  badness += j;
532  } else {
534  search->strategy_ = &BoyerMooreHorspoolSearch;
535  return BoyerMooreHorspoolSearch(search, subject, i);
536  }
537  }
538  return -1;
539 }
540 
541 
542 // Perform a a single stand-alone search.
543 // If searching multiple times for the same pattern, a search
544 // object should be constructed once and the Search function then called
545 // for each search.
546 template <typename SubjectChar, typename PatternChar>
547 int SearchString(Isolate* isolate,
550  int start_index) {
551  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
552  return search.Search(subject, start_index);
553 }
554 
555 }} // namespace v8::internal
556 
557 #endif // V8_STRING_SEARCH_H_
static const int kUC16AlphabetSize
Definition: isolate.h:822
static const int kBMMaxShift
Definition: isolate.h:823
static const int kLatin1AlphabetSize
Definition: string-search.h:33
static const int kUC16AlphabetSize
Definition: string-search.h:34
static const int kBMMinPatternLength
Definition: string-search.h:39
static bool IsOneByteString(Vector< const uint8_t > string)
Definition: string-search.h:41
static bool IsOneByteString(Vector< const uc16 > string)
Definition: string-search.h:45
static int BoyerMooreHorspoolSearch(StringSearch< PatternChar, SubjectChar > *search, Vector< const SubjectChar > subject, int start_index)
static int LinearSearch(StringSearch< PatternChar, SubjectChar > *search, Vector< const SubjectChar > subject, int start_index)
static bool exceedsOneByte(uint16_t c)
static int BoyerMooreSearch(StringSearch< PatternChar, SubjectChar > *search, Vector< const SubjectChar > subject, int start_index)
static int FailSearch(StringSearch< PatternChar, SubjectChar > *, Vector< const SubjectChar >, int)
Definition: string-search.h:99
static bool exceedsOneByte(uint8_t c)
static int InitialSearch(StringSearch< PatternChar, SubjectChar > *search, Vector< const SubjectChar > subject, int start_index)
static int SingleCharSearch(StringSearch< PatternChar, SubjectChar > *search, Vector< const SubjectChar > subject, int start_index)
StringSearch(Isolate *isolate, Vector< const PatternChar > pattern)
Definition: string-search.h:56
int Search(Vector< const SubjectChar > subject, int index)
Definition: string-search.h:78
static int CharOccurrence(int *bad_char_occurrence, SubjectChar char_code)
int(* SearchFunction)(StringSearch< PatternChar, SubjectChar > *, Vector< const SubjectChar >, int)
Definition: string-search.h:94
Vector< const PatternChar > pattern_
static const uint32_t kMaxOneByteCharCodeU
Definition: objects.h:8812
static bool IsOneByte(const uc16 *chars, int length)
Definition: objects.h:8895
T * start() const
Definition: vector.h:47
int length() const
Definition: vector.h:41
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes enable context specialization in TurboFan execution budget before interrupt is triggered max percentage of megamorphic generic ICs to allow optimization enable use of SAHF instruction if enable use of VFP3 instructions if available enable use of NEON instructions if enable use of SDIV and UDIV instructions if enable use of MLS instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of d16 d31 registers on ARM this requires VFP3 force all emitted branches to be in long enable alignment of csp to bytes on platforms which prefer the register to always be NULL
enable harmony numeric enable harmony object literal extensions Optimize object Array shift
#define DCHECK(condition)
Definition: logging.h:205
#define DCHECK_EQ(v1, v2)
Definition: logging.h:206
unsigned short uint16_t
Definition: unicode.cc:23
int SearchString(Isolate *isolate, Vector< const SubjectChar > subject, Vector< const PatternChar > pattern, int start_index)
bool CharCompare(const PatternChar *pattern, const SubjectChar *subject, int length)
static LifetimePosition Max(LifetimePosition a, LifetimePosition b)
Debugger support for the V8 JavaScript engine.
Definition: accessors.cc:20