V8 Project
unibrow::Utf8 Class Reference

#include <unicode.h>

+ Collaboration diagram for unibrow::Utf8:

Static Public Member Functions

static uchar Length (uchar chr, int previous)
 
static unsigned EncodeOneByte (char *out, uint8_t c)
 
static unsigned Encode (char *out, uchar c, int previous, bool replace_invalid=false)
 
static uchar CalculateValue (const byte *str, unsigned length, unsigned *cursor)
 
static uchar ValueOf (const byte *str, unsigned length, unsigned *cursor)
 

Static Public Attributes

static const uchar kBadChar = 0xFFFD
 
static const unsigned kMaxEncodedSize = 4
 
static const unsigned kMaxOneByteChar = 0x7f
 
static const unsigned kMaxTwoByteChar = 0x7ff
 
static const unsigned kMaxThreeByteChar = 0xffff
 
static const unsigned kMaxFourByteChar = 0x1fffff
 
static const unsigned kBytesSavedByCombiningSurrogates = 2
 
static const unsigned kSizeOfUnmatchedSurrogate = 3
 
static const unsigned kMax16BitCodeUnitSize = 3
 

Detailed Description

Definition at line 125 of file unicode.h.

Member Function Documentation

◆ CalculateValue()

uchar unibrow::Utf8::CalculateValue ( const byte *  str,
unsigned  length,
unsigned *  cursor 
)
static

Definition at line 191 of file unicode.cc.

193  {
194  // We only get called for non-ASCII characters.
195  if (length == 1) {
196  *cursor += 1;
197  return kBadChar;
198  }
199  byte first = str[0];
200  byte second = str[1] ^ 0x80;
201  if (second & 0xC0) {
202  *cursor += 1;
203  return kBadChar;
204  }
205  if (first < 0xE0) {
206  if (first < 0xC0) {
207  *cursor += 1;
208  return kBadChar;
209  }
210  uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
211  if (code_point <= kMaxOneByteChar) {
212  *cursor += 1;
213  return kBadChar;
214  }
215  *cursor += 2;
216  return code_point;
217  }
218  if (length == 2) {
219  *cursor += 1;
220  return kBadChar;
221  }
222  byte third = str[2] ^ 0x80;
223  if (third & 0xC0) {
224  *cursor += 1;
225  return kBadChar;
226  }
227  if (first < 0xF0) {
228  uchar code_point = ((((first << 6) | second) << 6) | third)
229  & kMaxThreeByteChar;
230  if (code_point <= kMaxTwoByteChar) {
231  *cursor += 1;
232  return kBadChar;
233  }
234  *cursor += 3;
235  return code_point;
236  }
237  if (length == 3) {
238  *cursor += 1;
239  return kBadChar;
240  }
241  byte fourth = str[3] ^ 0x80;
242  if (fourth & 0xC0) {
243  *cursor += 1;
244  return kBadChar;
245  }
246  if (first < 0xF8) {
247  uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth)
248  & kMaxFourByteChar;
249  if (code_point <= kMaxThreeByteChar) {
250  *cursor += 1;
251  return kBadChar;
252  }
253  *cursor += 4;
254  return code_point;
255  }
256  *cursor += 1;
257  return kBadChar;
258 }
static const unsigned kMaxThreeByteChar
Definition: unicode.h:143
static const unsigned kMaxOneByteChar
Definition: unicode.h:141
static const uchar kBadChar
Definition: unicode.h:139
static const unsigned kMaxFourByteChar
Definition: unicode.h:144
static const unsigned kMaxTwoByteChar
Definition: unicode.h:142
unsigned int uchar
Definition: unicode.h:17

References kBadChar, kMaxFourByteChar, kMaxOneByteChar, kMaxThreeByteChar, and kMaxTwoByteChar.

Referenced by v8::internal::Utf8ToUtf16CharacterStream::CopyChars(), v8::internal::HeapSnapshotJSONSerializer::SerializeString(), and ValueOf().

+ Here is the caller graph for this function:

◆ Encode()

unsigned unibrow::Utf8::Encode ( char *  out,
uchar  c,
int  previous,
bool  replace_invalid = false 
)
inlinestatic

Definition at line 91 of file unicode-inl.h.

94  {
95  static const int kMask = ~(1 << 6);
96  if (c <= kMaxOneByteChar) {
97  str[0] = c;
98  return 1;
99  } else if (c <= kMaxTwoByteChar) {
100  str[0] = 0xC0 | (c >> 6);
101  str[1] = 0x80 | (c & kMask);
102  return 2;
103  } else if (c <= kMaxThreeByteChar) {
104  if (Utf16::IsSurrogatePair(previous, c)) {
105  const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
106  return Encode(str - kUnmatchedSize,
107  Utf16::CombineSurrogatePair(previous, c),
108  Utf16::kNoPreviousCharacter,
109  replace_invalid) - kUnmatchedSize;
110  } else if (replace_invalid &&
111  (Utf16::IsLeadSurrogate(c) ||
112  Utf16::IsTrailSurrogate(c))) {
113  c = kBadChar;
114  }
115  str[0] = 0xE0 | (c >> 12);
116  str[1] = 0x80 | ((c >> 6) & kMask);
117  str[2] = 0x80 | (c & kMask);
118  return 3;
119  } else {
120  str[0] = 0xF0 | (c >> 18);
121  str[1] = 0x80 | ((c >> 12) & kMask);
122  str[2] = 0x80 | ((c >> 6) & kMask);
123  str[3] = 0x80 | (c & kMask);
124  return 4;
125  }
126 }
static const int kNoPreviousCharacter
Definition: unicode.h:97
static bool IsSurrogatePair(int lead, int trail)
Definition: unicode.h:82
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition: unicode.h:94
static bool IsTrailSurrogate(int code)
Definition: unicode.h:89
static bool IsLeadSurrogate(int code)
Definition: unicode.h:85
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
Definition: unicode-inl.h:91
static const unsigned kSizeOfUnmatchedSurrogate
Definition: unicode.h:149

References unibrow::Utf16::CombineSurrogatePair(), unibrow::Utf16::IsLeadSurrogate(), unibrow::Utf16::IsSurrogatePair(), unibrow::Utf16::IsTrailSurrogate(), kBadChar, kMaxOneByteChar, kMaxThreeByteChar, kMaxTwoByteChar, unibrow::Utf16::kNoPreviousCharacter, and kSizeOfUnmatchedSurrogate.

Referenced by v8::internal::CodeEventLogger::NameBuffer::AppendString(), and v8::internal::String::ToCString().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ EncodeOneByte()

unsigned unibrow::Utf8::EncodeOneByte ( char *  out,
uint8_t  c 
)
inlinestatic

Definition at line 76 of file unicode-inl.h.

76  {
77  static const int kMask = ~(1 << 6);
78  if (c <= kMaxOneByteChar) {
79  str[0] = c;
80  return 1;
81  }
82  str[0] = 0xC0 | (c >> 6);
83  str[1] = 0x80 | (c & kMask);
84  return 2;
85 }

References kMaxOneByteChar.

◆ Length()

unsigned unibrow::Utf8::Length ( uchar  chr,
int  previous 
)
inlinestatic

Definition at line 140 of file unicode-inl.h.

140  {
141  if (c <= kMaxOneByteChar) {
142  return 1;
143  } else if (c <= kMaxTwoByteChar) {
144  return 2;
145  } else if (c <= kMaxThreeByteChar) {
146  if (Utf16::IsTrailSurrogate(c) &&
147  Utf16::IsLeadSurrogate(previous)) {
148  return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
149  }
150  return 3;
151  } else {
152  return 4;
153  }
154 }
static const unsigned kBytesSavedByCombiningSurrogates
Definition: unicode.h:148

References unibrow::Utf16::IsLeadSurrogate(), unibrow::Utf16::IsTrailSurrogate(), kBytesSavedByCombiningSurrogates, kMaxOneByteChar, kMaxThreeByteChar, kMaxTwoByteChar, and kSizeOfUnmatchedSurrogate.

Referenced by v8::internal::CodeEventLogger::NameBuffer::AppendString(), v8::internal::String::ToCString(), and v8::Utf8LengthHelper::Visitor::VisitTwoByteString().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ ValueOf()

uchar unibrow::Utf8::ValueOf ( const byte *  str,
unsigned  length,
unsigned *  cursor 
)
inlinestatic

Definition at line 129 of file unicode-inl.h.

129  {
130  if (length <= 0) return kBadChar;
131  byte first = bytes[0];
132  // Characters between 0000 and 0007F are encoded as a single character
133  if (first <= kMaxOneByteChar) {
134  *cursor += 1;
135  return first;
136  }
137  return CalculateValue(bytes, length, cursor);
138 }
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:191

References CalculateValue(), kBadChar, and kMaxOneByteChar.

Referenced by v8::internal::StringHasher::ComputeUtf8Hash(), v8::internal::String::IsUtf8EqualTo(), unibrow::Utf8DecoderBase::Reset(), v8::internal::WriteTwoByteData(), and unibrow::Utf8DecoderBase::WriteUtf16Slow().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

Member Data Documentation

◆ kBadChar

const uchar unibrow::Utf8::kBadChar = 0xFFFD
static

◆ kBytesSavedByCombiningSurrogates

const unsigned unibrow::Utf8::kBytesSavedByCombiningSurrogates = 2
static

◆ kMax16BitCodeUnitSize

const unsigned unibrow::Utf8::kMax16BitCodeUnitSize = 3
static

Definition at line 152 of file unicode.h.

Referenced by v8::String::WriteUtf8().

◆ kMaxEncodedSize

const unsigned unibrow::Utf8::kMaxEncodedSize = 4
static

◆ kMaxFourByteChar

const unsigned unibrow::Utf8::kMaxFourByteChar = 0x1fffff
static

Definition at line 144 of file unicode.h.

Referenced by CalculateValue().

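◆ kMaxOneByteChar

const unsigned unibrow::Utf8::kMaxOneByteChar = 0x7f
static

Definition at line 141 of file unicode.h.

Referenced by CalculateValue(), Encode(), EncodeOneByte(), Length(), and ValueOf().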

◆ kMaxThreeByteChar

const unsigned unibrow::Utf8::kMaxThreeByteChar = 0xffff
static

Definition at line 143 of file unicode.h.

Referenced by CalculateValue(), Encode(), and Length().

◆ kMaxTwoByteChar

const unsigned unibrow::Utf8::kMaxTwoByteChar = 0x7ff
static

Definition at line 142 of file unicode.h.

Referenced by CalculateValue(), Encode(), and Length().

◆ kSizeOfUnmatchedSurrogate

const unsigned unibrow::Utf8::kSizeOfUnmatchedSurrogate = 3
static

Definition at line 149 of file unicode.h.

Referenced by Encode(), and Length().


The documentation for this class was generated from the following files: