V8 Project
unicode.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_UNICODE_H_
6 #define V8_UNICODE_H_
7 
8 #include <sys/types.h>
9 #include "src/globals.h"
10 /**
11  * \file
12  * Definitions and convenience functions for working with unicode.
13  */
14 
15 namespace unibrow {
16 
17 typedef unsigned int uchar;  // Type used for character arguments throughout this header.
18 typedef unsigned char byte;  // Raw byte; element type of the UTF-8 input pointers taken by Utf8.
19 
20 /**
21  * The max length of the result of converting the case of a single
22  * character.
 *
 * The largest kMaxWidth declared by any mapping struct in this header
 * is also 4.
23  */
24 const int kMaxMappingSize = 4;
25 
// Predicate<T, size>: answers a boolean question about a single character
// through get(). Each CacheEntry pairs a code point with a one-bit result.
// The bodies of get() and CalculateValue() are not in this header (the
// symbol index on this page places them in unicode-inl.h), so how entries
// are filled and consulted is not visible here.
// NOTE(review): presumably T is one of the structs in this file that expose
// `static bool Is(uchar)` -- confirm against the inline implementation.
26 template <class T, int size = 256>
27 class Predicate {
28  public:
29  inline Predicate() { }
  // Returns the boolean result for character c.
30  inline bool get(uchar c);
31  private:
32  friend class Test;
  // NOTE(review): presumably the uncached computation behind get() -- confirm.
33  bool CalculateValue(uchar c);
34  struct CacheEntry {
  // A default entry is tagged with code point 0 and value false.
35  inline CacheEntry() : code_point_(0), value_(0) { }
36  inline CacheEntry(uchar code_point, bool value)
37  : code_point_(code_point),
38  value_(value) { }
  // NOTE(review): the declaration of code_point_ (original line 39) is
  // missing from this listing; the constructors above initialize it.
40  bool value_ : 1;  // One-bit bitfield holding the result.
41  };
42  static const int kSize = size;
  // kSize - 1. This works as an index mask only if size is a power of two --
  // presumably a requirement on the template argument; TODO confirm.
43  static const int kMask = kSize - 1;
  // NOTE(review): original line 44 is missing from this listing; the symbol
  // index on this page lists `CacheEntry entries_[kSize]` at that line.
45 };
46 
47 // A cache used in case conversion. It caches the value for characters
48 // that either have no mapping or map to a single character independent
49 // of context. Characters that map to more than one character or that
50 // map differently depending on context are always looked up.
//
// Each CacheEntry pairs a code point with a signed offset. The bodies of
// get() and CalculateValue() are not in this header (the symbol index on
// this page places them in unicode-inl.h).
// NOTE(review): presumably T is one of the structs in this file that expose
// `static int Convert(uchar, uchar, uchar*, bool*)` -- confirm.
51 template <class T, int size = 256>
52 class Mapping {
53  public:
54  inline Mapping() { }
  // NOTE(review): presumably c is the character to map, n the character that
  // follows it (context), result receives the mapped characters and the
  // return value is how many were written -- confirm in unicode-inl.h.
55  inline int get(uchar c, uchar n, uchar* result);
56  private:
57  friend class Test;
58  int CalculateValue(uchar c, uchar n, uchar* result);
59  struct CacheEntry {
  // A default entry is tagged with kNoChar and has offset 0.
60  inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
61  inline CacheEntry(uchar code_point, signed offset)
62  : code_point_(code_point),
63  offset_(offset) { }
  // NOTE(review): the declaration of code_point_ (original line 64) is
  // missing from this listing; the constructors above initialize it.
65  signed offset_;
  // (1 << 21) - 1 == 0x1fffff, i.e. the low 21 bits all set. This is the
  // same value as Utf8::kMaxFourByteChar in this header.
66  static const int kNoChar = (1 << 21) - 1;
67  };
68  static const int kSize = size;
  // kSize - 1. This works as an index mask only if size is a power of two --
  // presumably a requirement on the template argument; TODO confirm.
69  static const int kMask = kSize - 1;
  // NOTE(review): original line 70 is missing from this listing; the symbol
  // index on this page lists `CacheEntry entries_[kSize]` at that line.
71 };
72 
// Holder for two static members. Both are private, so within this header
// only the friend class Test can reach them.
73 class UnicodeData {
74  private:
75  friend class Test;
  // Defined in unicode.cc according to the symbol index on this page.
76  static int GetByteCount();
  // Declared without an in-class initializer; its value is not visible here.
77  static const uchar kMaxCodePoint;
78 };
79 
// Static helpers and constants for UTF-16 surrogate handling.
80 class Utf16 {
81  public:
  // True when lead passes IsLeadSurrogate and trail passes IsTrailSurrogate.
82  static inline bool IsSurrogatePair(int lead, int trail) {
83  return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
84  }
  // True when bits 10-15 of code match 0xd800, i.e. the low 16 bits lie in
  // 0xd800..0xdbff. Bits above bit 15 are not examined.
85  static inline bool IsLeadSurrogate(int code) {
  // Explicit guard for the "no previous character" sentinel (-1).
86  if (code == kNoPreviousCharacter) return false;
87  return (code & 0xfc00) == 0xd800;
88  }
  // True when bits 10-15 of code match 0xdc00, i.e. the low 16 bits lie in
  // 0xdc00..0xdfff. Bits above bit 15 are not examined.
89  static inline bool IsTrailSurrogate(int code) {
  // Explicit guard for the "no previous character" sentinel (-1).
90  if (code == kNoPreviousCharacter) return false;
91  return (code & 0xfc00) == 0xdc00;
92  }
93 
  // Builds a code point from the low 10 bits of lead (shifted up by 10) and
  // the low 10 bits of trail, offset by 0x10000. The arguments are not
  // checked for being surrogates.
94  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
95  return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
96  }
  // Sentinel that IsLeadSurrogate/IsTrailSurrogate reject up front.
97  static const int kNoPreviousCharacter = -1;
98  static const uchar kMaxNonSurrogateCharCode = 0xffff;
99  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
100  // of UTF-8 data. The special case where the unit is a surrogate
101  // trail produces 1 byte net, because the encoding of the pair is
102  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
103  // can be reclaimed.
  // NOTE(review): the constant the comment above documents (original line
  // 104) is missing from this listing; the symbol index on this page lists
  // kMaxExtraUtf8BytesForOneUtf16CodeUnit at that line.
105  // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
106  // The illegality stems from the surrogate not being part of a pair.
107  static const int kUtf8BytesToCodeASurrogate = 3;
  // 0xd800 plus bits 10-19 of (char_code - 0x10000). There is no check that
  // char_code >= 0x10000; the unsigned subtraction wraps for smaller values.
108  static inline uint16_t LeadSurrogate(uint32_t char_code) {
109  return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
110  }
  // 0xdc00 plus the low 10 bits of char_code.
111  static inline uint16_t TrailSurrogate(uint32_t char_code) {
112  return 0xdc00 + (char_code & 0x3ff);
113  }
114 };
115 
// Constants and helpers for Latin-1 characters.
116 class Latin1 {
117  public:
  // Largest Latin-1 character value.
118  static const unsigned kMaxChar = 0xff;
119  // Returns 0 if character does not convert to single latin-1 character
120  // or if the character does not convert back to latin-1 via inverse
121  // operation (upper to lower, etc).
  // NOTE(review): the declaration this comment documents (original line 122)
  // is missing from this listing; presumably it is the
  // ConvertNonLatin1ToLatin1(uint16_t) listed in the symbol index on this
  // page -- confirm.
123 };
124 
// Static helpers and size constants for UTF-8 encoding and decoding.
// Only declarations are visible here; the symbol index on this page places
// the inline bodies in unicode-inl.h and CalculateValue in unicode.cc.
125 class Utf8 {
126  public:
  // NOTE(review): presumably returns the number of UTF-8 bytes needed for
  // chr, with previous being the preceding UTF-16 code unit or
  // Utf16::kNoPreviousCharacter -- confirm in unicode-inl.h.
127  static inline uchar Length(uchar chr, int previous);
128  static inline unsigned EncodeOneByte(char* out, uint8_t c);
  // replace_invalid defaults to false.
  // NOTE(review): presumably writes the encoding of c to out and returns the
  // byte count; the buffer size required of out is not stated here -- confirm.
129  static inline unsigned Encode(char* out,
130  uchar c,
131  int previous,
132  bool replace_invalid = false);
  // Same parameter list as ValueOf below; how the two relate is not visible
  // in this header.
133  static uchar CalculateValue(const byte* str,
134  unsigned length,
135  unsigned* cursor);
136 
137  // The unicode replacement character, used to signal invalid unicode
138  // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
139  static const uchar kBadChar = 0xFFFD;
140  static const unsigned kMaxEncodedSize = 4;
  // Largest character value for each encoded length, as the names state.
141  static const unsigned kMaxOneByteChar = 0x7f;
142  static const unsigned kMaxTwoByteChar = 0x7ff;
143  static const unsigned kMaxThreeByteChar = 0xffff;
144  static const unsigned kMaxFourByteChar = 0x1fffff;
145 
146  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
147  // that match are coded as a 4 byte UTF-8 sequence.
  // Hence two unmatched surrogates cost 3 + 3 bytes and a matched pair
  // costs 4, a saving of 2.
148  static const unsigned kBytesSavedByCombiningSurrogates = 2;
149  static const unsigned kSizeOfUnmatchedSurrogate = 3;
150  // The maximum size a single UTF-16 code unit may take up when encoded as
151  // UTF-8.
152  static const unsigned kMax16BitCodeUnitSize = 3;
  // NOTE(review): presumably decodes one character from str (at most length
  // bytes) and advances *cursor by the bytes consumed -- confirm in
  // unicode-inl.h.
153  static inline uchar ValueOf(const byte* str,
154  unsigned length,
155  unsigned* cursor);
156 };
157 
158 
// NOTE(review): the class header (original line 159) is missing from this
// listing. The constructors below and the base-class name used by
// Utf8Decoder identify this body as class Utf8DecoderBase.
160  public:
161  // Initialization done in subclass.
162  inline Utf8DecoderBase();
  // Takes the same four arguments as Reset() below.
163  inline Utf8DecoderBase(uint16_t* buffer,
164  unsigned buffer_length,
165  const uint8_t* stream,
166  unsigned stream_length);
  // Returns the utf16_length_ member, which Reset() sets.
167  inline unsigned Utf16Length() const { return utf16_length_; }
168  protected:
169  // This reads all characters and sets the utf16_length_.
170  // The first buffer_length utf16 chars are cached in the buffer.
171  void Reset(uint16_t* buffer,
172  unsigned buffer_length,
173  const uint8_t* stream,
174  unsigned stream_length);
  // Defined in unicode.cc according to the symbol index on this page.
  // NOTE(review): presumably decodes from stream into data without using the
  // cached buffer; the unit of length is not stated here -- confirm.
175  static void WriteUtf16Slow(const uint8_t* stream,
176  uint16_t* data,
177  unsigned length);
  // NOTE(review): presumably points at the first input byte whose decoded
  // output did not fit in the buffer -- confirm in unicode.cc.
178  const uint8_t* unbuffered_start_;
179  unsigned utf16_length_;
  // NOTE(review): original lines 180 and 182 are missing from this listing;
  // the symbol index on this page lists `bool last_byte_of_buffer_unused_`
  // at line 180 and a DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase) entry.
181  private:
183 };
184 
// Utf8DecoderBase subclass parameterized by a buffer size. The public API
// takes `const char*` input. The member bodies are not in this header (the
// symbol index on this page places Reset and WriteUtf16 in unicode-inl.h).
185 template <unsigned kBufferSize>
186 class Utf8Decoder : public Utf8DecoderBase {
187  public:
188  inline Utf8Decoder() {}
189  inline Utf8Decoder(const char* stream, unsigned length);
  // Two-argument public overload; the base class declares a protected
  // four-argument Reset.
190  inline void Reset(const char* stream, unsigned length);
  // NOTE(review): presumably writes up to length UTF-16 code units into data
  // and returns how many were written -- confirm in unicode-inl.h.
191  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
192  private:
  // NOTE(review): original line 193 is missing from this listing; the symbol
  // index on this page lists `uint16_t buffer_[kBufferSize]` at that line.
194 };
195 
196 
// Declares one static predicate over a single character. The body is not in
// this header.
// NOTE(review): presumably tests for an uppercase character -- confirm in
// unicode.cc.
197 struct Uppercase {
198  static bool Is(uchar c);
199 };
// Declares one static predicate over a single character. The body is not in
// this header.
// NOTE(review): presumably tests for a lowercase character -- confirm in
// unicode.cc.
200 struct Lowercase {
201  static bool Is(uchar c);
202 };
// Declares one static predicate over a single character. The body is not in
// this header.
// NOTE(review): presumably tests for a letter; which Unicode categories
// count is not visible here -- confirm in unicode.cc.
203 struct Letter {
204  static bool Is(uchar c);
205 };
// Declares one static predicate over a single character. The body is not in
// this header.
// NOTE(review): presumably tests for a numeric character; which Unicode
// categories count is not visible here -- confirm in unicode.cc.
206 struct Number {
207  static bool Is(uchar c);
208 };
// Declares one static predicate over a single character. The body is not in
// this header.
// NOTE(review): presumably tests for a whitespace character; the exact set
// is not visible here -- confirm in unicode.cc.
209 struct WhiteSpace {
210  static bool Is(uchar c);
211 };
// NOTE(review): three struct bodies follow whose `struct <Name> {` header
// lines (original lines 212, 215 and 218) are missing from this listing, so
// their names cannot be read here. Each declares the same static predicate
// as the structs above it.
213  static bool Is(uchar c);
214 };
// NOTE(review): struct header (original line 215) missing from this listing.
216  static bool Is(uchar c);
217 };
// NOTE(review): struct header (original line 218) missing from this listing.
219  static bool Is(uchar c);
220 };
// Mapping policy for lower-casing; declares the same Convert signature as
// ToUppercase. The body of Convert is not in this header.
221 struct ToLowercase {
  // NOTE(review): presumably the largest number of characters Convert can
  // write to result -- confirm in unicode.cc.
222  static const int kMaxWidth = 3;
  // Direction flag; ToUppercase declares the same constant as false.
223  static const bool kIsToLower = true;
  // NOTE(review): presumably c is the character to convert, n the following
  // character (context), result the output array, the return value the
  // number of characters written, and *allow_caching_ptr an out-flag telling
  // the caller whether the result may be cached -- confirm in unicode.cc.
224  static int Convert(uchar c,
225  uchar n,
226  uchar* result,
227  bool* allow_caching_ptr);
228 };
// Mapping policy for upper-casing; declares the same Convert signature as
// ToLowercase. The body of Convert is not in this header.
229 struct ToUppercase {
  // NOTE(review): presumably the largest number of characters Convert can
  // write to result -- confirm in unicode.cc.
230  static const int kMaxWidth = 3;
  // Direction flag; ToLowercase declares the same constant as true.
231  static const bool kIsToLower = false;
  // NOTE(review): presumably c is the character to convert, n the following
  // character (context), result the output array, the return value the
  // number of characters written, and *allow_caching_ptr an out-flag telling
  // the caller whether the result may be cached -- confirm in unicode.cc.
232  static int Convert(uchar c,
233  uchar n,
234  uchar* result,
235  bool* allow_caching_ptr);
236 };
// NOTE(review): the `struct <Name> {` header for this body (original line
// 237) is missing from this listing, so the struct's name cannot be read
// here. It declares the same Convert signature as ToLowercase/ToUppercase.
238  static const int kMaxWidth = 1;
  // NOTE(review): parameter roles presumably match ToLowercase::Convert --
  // confirm in unicode.cc.
239  static int Convert(uchar c,
240  uchar n,
241  uchar* result,
242  bool* allow_caching_ptr);
243 };
// NOTE(review): the `struct <Name> {` header for this body (original line
// 244) is missing from this listing, so the struct's name cannot be read
// here. It declares the same Convert signature as ToLowercase/ToUppercase.
  // Equal to the file-level kMaxMappingSize (4).
245  static const int kMaxWidth = 4;
  // NOTE(review): parameter roles presumably match ToLowercase::Convert --
  // confirm in unicode.cc.
246  static int Convert(uchar c,
247  uchar n,
248  uchar* result,
249  bool* allow_caching_ptr);
250 };
// NOTE(review): the `struct <Name> {` header for this body (original line
// 251) is missing from this listing, so the struct's name cannot be read
// here. It declares the same Convert signature as ToLowercase/ToUppercase.
252  static const int kMaxWidth = 1;
  // NOTE(review): parameter roles presumably match ToLowercase::Convert --
  // confirm in unicode.cc.
253  static int Convert(uchar c,
254  uchar n,
255  uchar* result,
256  bool* allow_caching_ptr);
257 };
258 
259 } // namespace unibrow
260 
261 #endif // V8_UNICODE_H_
static const unsigned kMaxChar
Definition: unicode.h:118
static uint16_t ConvertNonLatin1ToLatin1(uint16_t)
Definition: unicode-inl.h:60
friend class Test
Definition: unicode.h:57
static const int kMask
Definition: unicode.h:69
int CalculateValue(uchar c, uchar n, uchar *result)
Definition: unicode-inl.h:42
CacheEntry entries_[kSize]
Definition: unicode.h:70
static const int kSize
Definition: unicode.h:68
int get(uchar c, uchar n, uchar *result)
Definition: unicode-inl.h:27
friend class Test
Definition: unicode.h:32
bool get(uchar c)
Definition: unicode-inl.h:14
bool CalculateValue(uchar c)
Definition: unicode-inl.h:20
CacheEntry entries_[kSize]
Definition: unicode.h:44
static const int kMask
Definition: unicode.h:43
static const int kSize
Definition: unicode.h:42
static const uchar kMaxCodePoint
Definition: unicode.h:77
friend class Test
Definition: unicode.h:75
static int GetByteCount()
Definition: unicode.cc:1774
static uint16_t LeadSurrogate(uint32_t char_code)
Definition: unicode.h:108
static const int kNoPreviousCharacter
Definition: unicode.h:97
static const uchar kMaxNonSurrogateCharCode
Definition: unicode.h:98
static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit
Definition: unicode.h:104
static uint16_t TrailSurrogate(uint32_t char_code)
Definition: unicode.h:111
static bool IsSurrogatePair(int lead, int trail)
Definition: unicode.h:82
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition: unicode.h:94
static bool IsTrailSurrogate(int code)
Definition: unicode.h:89
static const int kUtf8BytesToCodeASurrogate
Definition: unicode.h:107
static bool IsLeadSurrogate(int code)
Definition: unicode.h:85
unsigned Utf16Length() const
Definition: unicode.h:167
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase)
bool last_byte_of_buffer_unused_
Definition: unicode.h:180
const uint8_t * unbuffered_start_
Definition: unicode.h:178
static void WriteUtf16Slow(const uint8_t *stream, uint16_t *data, unsigned length)
Definition: unicode.cc:308
void Reset(uint16_t *buffer, unsigned buffer_length, const uint8_t *stream, unsigned stream_length)
Definition: unicode.cc:261
void Reset(const char *stream, unsigned length)
Definition: unicode-inl.h:177
unsigned WriteUtf16(uint16_t *data, unsigned length) const
Definition: unicode-inl.h:185
uint16_t buffer_[kBufferSize]
Definition: unicode.h:193
static const unsigned kMaxThreeByteChar
Definition: unicode.h:143
static const unsigned kMaxOneByteChar
Definition: unicode.h:141
static uchar ValueOf(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode-inl.h:129
static const uchar kBadChar
Definition: unicode.h:139
static uchar Length(uchar chr, int previous)
Definition: unicode-inl.h:140
static const unsigned kMaxFourByteChar
Definition: unicode.h:144
static const unsigned kMaxEncodedSize
Definition: unicode.h:140
static unsigned EncodeOneByte(char *out, uint8_t c)
Definition: unicode-inl.h:76
static const unsigned kMaxTwoByteChar
Definition: unicode.h:142
static const unsigned kMax16BitCodeUnitSize
Definition: unicode.h:152
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
Definition: unicode-inl.h:91
static const unsigned kBytesSavedByCombiningSurrogates
Definition: unicode.h:148
static const unsigned kSizeOfUnmatchedSurrogate
Definition: unicode.h:149
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:191
enable harmony numeric enable harmony object literal extensions Optimize object size
unsigned short uint16_t
Definition: unicode.cc:23
const int kMaxMappingSize
The max length of the result of converting the case of a single character.
Definition: unicode.h:24
unsigned int uchar
Definition: unicode.h:17
unsigned char byte
Definition: unicode.h:18
static const int kMaxWidth
Definition: unicode.h:252
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1740
static bool Is(uchar c)
Definition: unicode.cc:821
static bool Is(uchar c)
Definition: unicode.cc:852
static const int kMaxWidth
Definition: unicode.h:238
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1284
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1675
static const int kMaxWidth
Definition: unicode.h:245
static bool Is(uchar c)
Definition: unicode.cc:658
static bool Is(uchar c)
Definition: unicode.cc:755
static bool Is(uchar c)
Definition: unicode.cc:533
static const int kNoChar
Definition: unicode.h:66
CacheEntry(uchar code_point, signed offset)
Definition: unicode.h:61
static bool Is(uchar c)
Definition: unicode.cc:708
CacheEntry(uchar code_point, bool value)
Definition: unicode.h:36
static const bool kIsToLower
Definition: unicode.h:223
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:972
static const int kMaxWidth
Definition: unicode.h:222
static const bool kIsToLower
Definition: unicode.h:231
static const int kMaxWidth
Definition: unicode.h:230
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1143
static bool Is(uchar c)
Definition: unicode.cc:421
static bool Is(uchar c)
Definition: unicode.cc:733