V8 Project
scanner-character-streams.cc
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/v8.h"
6 
8 
9 #include "include/v8.h"
10 #include "src/handles.h"
11 #include "src/unicode-inl.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 namespace {
17 
18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
19  unsigned* src_pos, unsigned src_length,
21  if (encoding == ScriptCompiler::StreamedSource::UTF8) {
23  dest, length, src, src_pos, src_length);
24  }
25 
26  unsigned to_fill = length;
27  if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;
28 
30  v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);
31  } else {
33  v8::internal::CopyChars<uint16_t, uint16_t>(
34  dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);
35  }
36  *src_pos += to_fill;
37  return to_fill;
38 }
39 
40 } // namespace
41 
42 
43 // ----------------------------------------------------------------------------
44 // BufferedUtf16CharacterStreams
45 
48  pushback_limit_(NULL) {
49  // Initialize buffer as being empty. First read will fill the buffer.
52 }
53 
54 
56 
58  if (character == kEndOfInput) {
59  pos_--;
60  return;
61  }
63  // buffer_ is writable, buffer_cursor_ is const pointer.
64  buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
65  pos_--;
66  return;
67  }
68  SlowPushBack(static_cast<uc16>(character));
69 }
70 
71 
73  // In pushback mode, the end of the buffer contains pushback,
74  // and the start of the buffer (from buffer start to pushback_limit_)
75  // contains valid data that comes just after the pushback.
76  // We NULL the pushback_limit_ if pushing all the way back to the
77  // start of the buffer.
78 
79  if (pushback_limit_ == NULL) {
80  // Enter pushback mode.
84  }
85  // Ensure that there is room for at least one pushback.
87  DCHECK(pos_ > 0);
88  buffer_[--buffer_cursor_ - buffer_] = character;
89  if (buffer_cursor_ == buffer_) {
91  } else if (buffer_cursor_ < pushback_limit_) {
93  }
94  pos_--;
95 }
96 
97 
100  if (pushback_limit_ != NULL) {
101  // Leave pushback mode.
104  // If there were any valid characters left at the
105  // start of the buffer, use those.
106  if (buffer_cursor_ < buffer_end_) return true;
107  // Otherwise read a new block.
108  }
109  unsigned length = FillBuffer(pos_);
110  buffer_end_ = buffer_ + length;
111  return length > 0;
112 }
113 
114 
116  // Leave pushback mode (i.e., ignore that there might be valid data
117  // in the buffer before the pushback_limit_ point).
119  return BufferSeekForward(delta);
120 }
121 
122 
123 // ----------------------------------------------------------------------------
124 // GenericStringUtf16CharacterStream
125 
126 
128  Handle<String> data,
129  unsigned start_position,
130  unsigned end_position)
131  : string_(data),
132  length_(end_position) {
133  DCHECK(end_position >= start_position);
134  pos_ = start_position;
135 }
136 
137 
139 
140 
142  unsigned old_pos = pos_;
143  pos_ = Min(pos_ + delta, length_);
144  ReadBlock();
145  return pos_ - old_pos;
146 }
147 
148 
150  if (from_pos >= length_) return 0;
151  unsigned length = kBufferSize;
152  if (from_pos + length > length_) {
153  length = length_ - from_pos;
154  }
155  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
156  return length;
157 }
158 
159 
160 // ----------------------------------------------------------------------------
161 // Utf8ToUtf16CharacterStream
163  unsigned length)
165  raw_data_(data),
166  raw_data_length_(length),
167  raw_data_pos_(0),
168  raw_character_position_(0) {
169  ReadBlock();
170 }
171 
172 
174 
175 
176 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,
177  const byte* src,
178  unsigned* src_pos,
179  unsigned src_length) {
180  static const unibrow::uchar kMaxUtf16Character = 0xffff;
181  unsigned i = 0;
182  // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
183  // one character early (in the normal case), because we need to have at least
184  // two free spaces in the buffer to be sure that the next character will fit.
185  while (i < length - 1) {
186  if (*src_pos == src_length) break;
187  unibrow::uchar c = src[*src_pos];
189  *src_pos = *src_pos + 1;
190  } else {
191  c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
192  src_pos);
193  }
194  if (c > kMaxUtf16Character) {
195  dest[i++] = unibrow::Utf16::LeadSurrogate(c);
196  dest[i++] = unibrow::Utf16::TrailSurrogate(c);
197  } else {
198  dest[i++] = static_cast<uc16>(c);
199  }
200  }
201  return i;
202 }
203 
204 
206  unsigned old_pos = pos_;
207  unsigned target_pos = pos_ + delta;
208  SetRawPosition(target_pos);
210  ReadBlock();
211  return pos_ - old_pos;
212 }
213 
214 
215 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
216  SetRawPosition(char_position);
217  if (raw_character_position_ != char_position) {
218  // char_position was not a valid position in the stream (hit the end
219  // while spooling to it).
220  return 0u;
221  }
224  raw_character_position_ = char_position + i;
225  return i;
226 }
227 
228 
229 static const byte kUtf8MultiByteMask = 0xC0;
230 static const byte kUtf8MultiByteCharFollower = 0x80;
231 
232 
233 #ifdef DEBUG
234 static const byte kUtf8MultiByteCharStart = 0xC0;
235 static bool IsUtf8MultiCharacterStart(byte first_byte) {
236  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
237 }
238 #endif
239 
240 
241 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
242  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
243 }
244 
245 
246 // Move the cursor back to point at the preceding UTF-8 character start
247 // in the buffer.
248 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
249  byte character = buffer[--*cursor];
250  if (character > unibrow::Utf8::kMaxOneByteChar) {
252  // Last byte of a multi-byte character encoding. Step backwards until
253  // pointing to the first byte of the encoding, recognized by having the
254  // top two bits set.
255  while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
256  DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
257  }
258 }
259 
260 
261 // Move the cursor forward to point at the next following UTF-8 character start
262 // in the buffer.
263 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
264  byte character = buffer[(*cursor)++];
265  if (character > unibrow::Utf8::kMaxOneByteChar) {
266  // First character of a multi-byte character encoding.
267  // The number of most-significant one-bits determines the length of the
268  // encoding:
269  // 110..... - (0xCx, 0xDx) one additional byte (minimum).
270  // 1110.... - (0xEx) two additional bytes.
271  // 11110... - (0xFx) three additional bytes (maximum).
272  DCHECK(IsUtf8MultiCharacterStart(character));
273  // Additional bytes is:
274  // 1 if value in range 0xC0 .. 0xDF.
275  // 2 if value in range 0xE0 .. 0xEF.
276  // 3 if value in range 0xF0 .. 0xF7.
277  // Encode that in a single value.
278  unsigned additional_bytes =
279  ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
280  *cursor += additional_bytes;
281  DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
282  }
283 }
284 
285 
286 // This can't set a raw position between the two code units of a surrogate
287 // pair, since there is no position in the UTF-8 stream that corresponds to
288 // that. This assumes the pair is correctly coded as one 4-byte UTF-8 sequence;
289 // if it is illegally coded as two 3-byte sequences there is no problem here.
290 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
291  if (raw_character_position_ > target_position) {
292  // Spool backwards in utf8 buffer.
293  do {
294  int old_pos = raw_data_pos_;
297  DCHECK(old_pos - raw_data_pos_ <= 4);
298  // Step back over both code units for surrogate pairs.
299  if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
300  } while (raw_character_position_ > target_position);
301  // No surrogate pair splitting.
302  DCHECK(raw_character_position_ == target_position);
303  return;
304  }
305  // Spool forwards in the utf8 buffer.
306  while (raw_character_position_ < target_position) {
307  if (raw_data_pos_ == raw_data_length_) return;
308  int old_pos = raw_data_pos_;
311  DCHECK(raw_data_pos_ - old_pos <= 4);
312  if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
313  }
314  // No surrogate pair splitting.
315  DCHECK(raw_character_position_ == target_position);
316 }
317 
318 
319 unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
320  // Ignore "position" which is the position in the decoded data. Instead,
321  // ExternalStreamingStream keeps track of the position in the raw data.
322  unsigned data_in_buffer = 0;
323  // Note that the UTF-8 decoder might not be able to fill the buffer
324  // completely; it will typically leave the last character empty (see
325  // Utf8ToUtf16CharacterStream::CopyChars).
326  while (data_in_buffer < kBufferSize - 1) {
327  if (current_data_ == NULL) {
328  // GetMoreData will wait until the embedder has enough data. This is the
329  // boundary between the API, which uses size_t (the correct type here),
330  // and the internal parts, which use unsigned. TODO(marja): make the
331  // internal parts use size_t too.
333  static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
335  bool data_ends = current_data_length_ == 0;
336 
337  // A caveat: a data chunk might end with bytes from an incomplete UTF-8
338  // character (the rest of the bytes will be in the next chunk).
340  HandleUtf8SplitCharacters(&data_in_buffer);
341  if (!data_ends && current_data_offset_ == current_data_length_) {
342  // The data stream didn't end, but we used all the data in the
343  // chunk. This will only happen when the chunk was really small. We
344  // don't handle the case where a UTF-8 character is split over several
345  // chunks; in that case V8 won't crash, but it will be a parse error.
346  delete[] current_data_;
350  continue; // Request a new chunk.
351  }
352  }
353 
354  // Did the data stream end?
355  if (data_ends) {
357  return data_in_buffer;
358  }
359  }
360 
361  // Fill the buffer from current_data_.
362  unsigned new_offset = 0;
363  unsigned new_chars_in_buffer =
364  CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
365  current_data_ + current_data_offset_, &new_offset,
367  data_in_buffer += new_chars_in_buffer;
368  current_data_offset_ += new_offset;
369  DCHECK(data_in_buffer <= kBufferSize);
370 
371  // Did we use all the data in the data chunk?
373  delete[] current_data_;
377  }
378  }
379  return data_in_buffer;
380 }
381 
383  unsigned* data_in_buffer) {
384  // First check if we have leftover data from the last chunk.
385  unibrow::uchar c;
387  // Move the bytes which are part of the split character (which started in
388  // the previous chunk) into utf8_split_char_buffer_.
396  }
397 
398  // Convert the data in utf8_split_char_buffer_.
399  unsigned new_offset = 0;
400  unsigned new_chars_in_buffer =
401  CopyCharsHelper(buffer_ + *data_in_buffer,
402  kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
404  *data_in_buffer += new_chars_in_buffer;
405  // Make sure we used all the data.
406  DCHECK(new_offset == utf8_split_char_buffer_length_);
407  DCHECK(*data_in_buffer <= kBufferSize);
408 
410  }
411 
412  // Move bytes which are part of an incomplete character from the end of the
413  // current chunk to utf8_split_char_buffer_. They will be converted when the
414  // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
415  // bytes long, but if the data is invalid, we can have character values bigger
416  // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
423  }
425  for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
427  }
428 }
429 
430 
431 // ----------------------------------------------------------------------------
432 // ExternalTwoByteStringUtf16CharacterStream
433 
436 
437 
441  int start_position,
442  int end_position)
444  source_(data),
445  raw_data_(data->GetTwoByteData(start_position)) {
446  buffer_cursor_ = raw_data_,
447  buffer_end_ = raw_data_ + (end_position - start_position);
448  pos_ = start_position;
449 }
450 
451 } } // namespace v8::internal
static uint16_t LeadSurrogate(uint32_t char_code)
Definition: unicode.h:108
static uint16_t TrailSurrogate(uint32_t char_code)
Definition: unicode.h:111
static const unsigned kMaxOneByteChar
Definition: unicode.h:141
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:191
virtual size_t GetMoreData(const uint8_t **src)=0
V8 calls this to request the next chunk of data from the embedder.
virtual unsigned BufferSeekForward(unsigned delta)=0
virtual unsigned FillBuffer(unsigned position)=0
ScriptCompiler::ExternalSourceStream * source_stream_
v8::ScriptCompiler::StreamedSource::Encoding encoding_
virtual unsigned FillBuffer(unsigned position)
void HandleUtf8SplitCharacters(unsigned *data_in_buffer)
ExternalTwoByteStringUtf16CharacterStream(Handle< ExternalTwoByteString > data, int start_position, int end_position)
GenericStringUtf16CharacterStream(Handle< String > data, unsigned start_position, unsigned end_position)
const uint16_t * buffer_end_
Definition: scanner.h:103
static const uc32 kEndOfInput
Definition: scanner.h:93
const uint16_t * buffer_cursor_
Definition: scanner.h:102
virtual unsigned FillBuffer(unsigned char_position)
Utf8ToUtf16CharacterStream(const byte *data, unsigned length)
virtual unsigned BufferSeekForward(unsigned delta)
static unsigned CopyChars(uint16_t *dest, unsigned length, const byte *src, unsigned *src_pos, unsigned src_length)
enable harmony numeric enable harmony object literal extensions Optimize object Array DOM strings and string trace pretenuring decisions of HAllocate instructions Enables optimizations which favor memory size over execution speed maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining trace the tracking of allocation sites deoptimize every n garbage collections perform array bounds checks elimination analyze liveness of environment slots and zap dead values flushes the cache of optimized code for closures on every GC allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes enable context specialization in TurboFan execution budget before interrupt is triggered max percentage of megamorphic generic ICs to allow optimization enable use of SAHF instruction if enable use of VFP3 instructions if available enable use of NEON instructions if enable use of SDIV and UDIV instructions if enable use of MLS instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of d16 d31 registers on ARM this requires VFP3 force all emitted branches to be in long enable alignment of csp to bytes on platforms which prefer the register to always be NULL
#define CHECK(condition)
Definition: logging.h:36
#define DCHECK(condition)
Definition: logging.h:205
unsigned short uint16_t
Definition: unicode.cc:23
unsigned int uchar
Definition: unicode.h:17
unsigned CopyCharsHelper(uint16_t *dest, unsigned length, const uint8_t *src, unsigned *src_pos, unsigned src_length, ScriptCompiler::StreamedSource::Encoding encoding)
static const byte kUtf8MultiByteCharFollower
static LifetimePosition Min(LifetimePosition a, LifetimePosition b)
static const byte kUtf8MultiByteMask
static void Utf8CharacterBack(const byte *buffer, unsigned *cursor)
uint16_t uc16
Definition: globals.h:184
int32_t uc32
Definition: globals.h:185
static bool IsUtf8MultiCharacterFollower(byte later_byte)
static void Utf8CharacterForward(const byte *buffer, unsigned *cursor)
Debugger support for the V8 JavaScript engine.
Definition: accessors.cc:20