V8 Project
unicode-inl.h
Go to the documentation of this file.
1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_UNICODE_INL_H_
6 #define V8_UNICODE_INL_H_
7 
8 #include "src/unicode.h"
9 #include "src/base/logging.h"
10 #include "src/utils.h"
11 
12 namespace unibrow {
13 
14 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
15  CacheEntry entry = entries_[code_point & kMask];
16  if (entry.code_point_ == code_point) return entry.value_;
17  return CalculateValue(code_point);
18 }
19 
20 template <class T, int s> bool Predicate<T, s>::CalculateValue(
21  uchar code_point) {
22  bool result = T::Is(code_point);
23  entries_[code_point & kMask] = CacheEntry(code_point, result);
24  return result;
25 }
26 
27 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
28  uchar* result) {
29  CacheEntry entry = entries_[c & kMask];
30  if (entry.code_point_ == c) {
31  if (entry.offset_ == 0) {
32  return 0;
33  } else {
34  result[0] = c + entry.offset_;
35  return 1;
36  }
37  } else {
38  return CalculateValue(c, n, result);
39  }
40 }
41 
42 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
43  uchar* result) {
44  bool allow_caching = true;
45  int length = T::Convert(c, n, result, &allow_caching);
46  if (allow_caching) {
47  if (length == 1) {
48  entries_[c & kMask] = CacheEntry(c, result[0] - c);
49  return 1;
50  } else {
51  entries_[c & kMask] = CacheEntry(c, 0);
52  return 0;
53  }
54  } else {
55  return length;
56  }
57 }
58 
59 
62  switch (c) {
63  // This are equivalent characters in unicode.
64  case 0x39c:
65  case 0x3bc:
66  return 0xb5;
67  // This is an uppercase of a Latin-1 character
68  // outside of Latin-1.
69  case 0x178:
70  return 0xff;
71  }
72  return 0;
73 }
74 
75 
76 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
77  static const int kMask = ~(1 << 6);
78  if (c <= kMaxOneByteChar) {
79  str[0] = c;
80  return 1;
81  }
82  str[0] = 0xC0 | (c >> 6);
83  str[1] = 0x80 | (c & kMask);
84  return 2;
85 }
86 
87 // Encode encodes the UTF-16 code units c and previous into the given str
88 // buffer, and combines surrogate code units into single code points. If
89 // replace_invalid is set to true, orphan surrogate code units will be replaced
90 // with kBadChar.
91 unsigned Utf8::Encode(char* str,
92  uchar c,
93  int previous,
94  bool replace_invalid) {
95  static const int kMask = ~(1 << 6);
96  if (c <= kMaxOneByteChar) {
97  str[0] = c;
98  return 1;
99  } else if (c <= kMaxTwoByteChar) {
100  str[0] = 0xC0 | (c >> 6);
101  str[1] = 0x80 | (c & kMask);
102  return 2;
103  } else if (c <= kMaxThreeByteChar) {
104  if (Utf16::IsSurrogatePair(previous, c)) {
105  const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
106  return Encode(str - kUnmatchedSize,
107  Utf16::CombineSurrogatePair(previous, c),
109  replace_invalid) - kUnmatchedSize;
110  } else if (replace_invalid &&
113  c = kBadChar;
114  }
115  str[0] = 0xE0 | (c >> 12);
116  str[1] = 0x80 | ((c >> 6) & kMask);
117  str[2] = 0x80 | (c & kMask);
118  return 3;
119  } else {
120  str[0] = 0xF0 | (c >> 18);
121  str[1] = 0x80 | ((c >> 12) & kMask);
122  str[2] = 0x80 | ((c >> 6) & kMask);
123  str[3] = 0x80 | (c & kMask);
124  return 4;
125  }
126 }
127 
128 
129 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
130  if (length <= 0) return kBadChar;
131  byte first = bytes[0];
132  // Characters between 0000 and 0007F are encoded as a single character
133  if (first <= kMaxOneByteChar) {
134  *cursor += 1;
135  return first;
136  }
137  return CalculateValue(bytes, length, cursor);
138 }
139 
140 unsigned Utf8::Length(uchar c, int previous) {
141  if (c <= kMaxOneByteChar) {
142  return 1;
143  } else if (c <= kMaxTwoByteChar) {
144  return 2;
145  } else if (c <= kMaxThreeByteChar) {
146  if (Utf16::IsTrailSurrogate(c) &&
147  Utf16::IsLeadSurrogate(previous)) {
149  }
150  return 3;
151  } else {
152  return 4;
153  }
154 }
155 
157  : unbuffered_start_(NULL),
158  utf16_length_(0),
159  last_byte_of_buffer_unused_(false) {}
160 
162  unsigned buffer_length,
163  const uint8_t* stream,
164  unsigned stream_length) {
165  Reset(buffer, buffer_length, stream, stream_length);
166 }
167 
168 template<unsigned kBufferSize>
169 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
170  : Utf8DecoderBase(buffer_,
171  kBufferSize,
172  reinterpret_cast<const uint8_t*>(stream),
173  length) {
174 }
175 
176 template<unsigned kBufferSize>
177 void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
178  Utf8DecoderBase::Reset(buffer_,
179  kBufferSize,
180  reinterpret_cast<const uint8_t*>(stream),
181  length);
182 }
183 
184 template <unsigned kBufferSize>
186  unsigned length) const {
187  DCHECK(length > 0);
188  if (length > utf16_length_) length = utf16_length_;
189  // memcpy everything in buffer.
190  unsigned buffer_length =
191  last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
192  unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
193  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
194  if (length <= buffer_length) return length;
195  DCHECK(unbuffered_start_ != NULL);
196  // Copy the rest the slow way.
197  WriteUtf16Slow(unbuffered_start_,
198  data + buffer_length,
199  length - buffer_length);
200  return length;
201 }
202 
203 } // namespace unibrow
204 
205 #endif // V8_UNICODE_INL_H_
static const unsigned kMaxChar
Definition: unicode.h:118
static uint16_t ConvertNonLatin1ToLatin1(uint16_t)
Definition: unicode-inl.h:60
int CalculateValue(uchar c, uchar n, uchar *result)
Definition: unicode-inl.h:42
int get(uchar c, uchar n, uchar *result)
Definition: unicode-inl.h:27
bool get(uchar c)
Definition: unicode-inl.h:14
bool CalculateValue(uchar c)
Definition: unicode-inl.h:20
static const int kNoPreviousCharacter
Definition: unicode.h:97
static bool IsSurrogatePair(int lead, int trail)
Definition: unicode.h:82
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition: unicode.h:94
static bool IsTrailSurrogate(int code)
Definition: unicode.h:89
static bool IsLeadSurrogate(int code)
Definition: unicode.h:85
void Reset(uint16_t *buffer, unsigned buffer_length, const uint8_t *stream, unsigned stream_length)
Definition: unicode.cc:261
void Reset(const char *stream, unsigned length)
Definition: unicode-inl.h:177
unsigned WriteUtf16(uint16_t *data, unsigned length) const
Definition: unicode-inl.h:185
static const unsigned kMaxThreeByteChar
Definition: unicode.h:143
static const unsigned kMaxOneByteChar
Definition: unicode.h:141
static uchar ValueOf(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode-inl.h:129
static const uchar kBadChar
Definition: unicode.h:139
static uchar Length(uchar chr, int previous)
Definition: unicode-inl.h:140
static unsigned EncodeOneByte(char *out, uint8_t c)
Definition: unicode-inl.h:76
static const unsigned kMaxTwoByteChar
Definition: unicode.h:142
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
Definition: unicode-inl.h:91
static const unsigned kBytesSavedByCombiningSurrogates
Definition: unicode.h:148
static const unsigned kSizeOfUnmatchedSurrogate
Definition: unicode.h:149
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:191
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes enable context specialization in TurboFan execution budget before interrupt is triggered max percentage of megamorphic generic ICs to allow optimization enable use of SAHF instruction if enable use of VFP3 instructions if available enable use of NEON instructions if enable use of SDIV and UDIV instructions if enable use of MLS instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of d16 d31 registers on ARM this requires VFP3 force all emitted branches to be in long enable alignment of csp to bytes on platforms which prefer the register to always be NULL
#define DCHECK(condition)
Definition: logging.h:205
unsigned short uint16_t
Definition: unicode.cc:23
unsigned int uchar
Definition: unicode.h:17
bool Is(Object *obj)
void MemCopy(void *dest, const void *src, size_t size)
Definition: utils.h:350
Definitions and convenience functions for working with unicode.