All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
encodedstream.h
1 // Tencent is pleased to support the open source community by making RapidJSON available.
2 //
3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4 //
5 // Licensed under the MIT License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // http://opensource.org/licenses/MIT
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #ifndef RAPIDJSON_ENCODEDSTREAM_H_
16 #define RAPIDJSON_ENCODEDSTREAM_H_
17 
18 #include "rapidjson.h"
19 
20 #ifdef __GNUC__
21 RAPIDJSON_DIAG_PUSH
22 RAPIDJSON_DIAG_OFF(effc++)
23 #endif
24 
25 RAPIDJSON_NAMESPACE_BEGIN
26 
27 //! Input byte stream wrapper with a statically bound encoding.
28 /*!
29  \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
30  \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
31 */
32 template <typename Encoding, typename InputByteStream>
34  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
35 public:
36  typedef typename Encoding::Ch Ch;
37 
38  EncodedInputStream(InputByteStream& is) : is_(is) {
39  current_ = Encoding::TakeBOM(is_);
40  }
41 
42  Ch Peek() const { return current_; }
43  Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
44  size_t Tell() const { return is_.Tell(); }
45 
46  // Not implemented
47  void Put(Ch) { RAPIDJSON_ASSERT(false); }
48  void Flush() { RAPIDJSON_ASSERT(false); }
49  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
50  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
51 
52 private:
54  EncodedInputStream& operator=(const EncodedInputStream&);
55 
56  InputByteStream& is_;
57  Ch current_;
58 };
59 
60 //! Output byte stream wrapper with statically bound encoding.
61 /*!
62  \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
63  \tparam InputByteStream Type of input byte stream. For example, FileWriteStream.
64 */
65 template <typename Encoding, typename OutputByteStream>
67  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
68 public:
69  typedef typename Encoding::Ch Ch;
70 
71  EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
72  if (putBOM)
73  Encoding::PutBOM(os_);
74  }
75 
76  void Put(Ch c) { Encoding::Put(os_, c); }
77  void Flush() { os_.Flush(); }
78 
79  // Not implemented
80  Ch Peek() const { RAPIDJSON_ASSERT(false); }
81  Ch Take() { RAPIDJSON_ASSERT(false); }
82  size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
83  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
84  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
85 
86 private:
88  EncodedOutputStream& operator=(const EncodedOutputStream&);
89 
90  OutputByteStream& os_;
91 };
92 
93 #define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
94 
95 //! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
96 /*!
97  \tparam CharType Type of character for reading.
98  \tparam InputByteStream type of input byte stream to be wrapped.
99 */
100 template <typename CharType, typename InputByteStream>
102  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
103 public:
104  typedef CharType Ch;
105 
106  //! Constructor.
107  /*!
108  \param is input stream to be wrapped.
109  \param type UTF encoding type if it is not detected from the stream.
110  */
111  AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
112  RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
113  DetectType();
114  static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
115  takeFunc_ = f[type_];
116  current_ = takeFunc_(*is_);
117  }
118 
119  UTFType GetType() const { return type_; }
120  bool HasBOM() const { return hasBOM_; }
121 
122  Ch Peek() const { return current_; }
123  Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
124  size_t Tell() const { return is_->Tell(); }
125 
126  // Not implemented
127  void Put(Ch) { RAPIDJSON_ASSERT(false); }
128  void Flush() { RAPIDJSON_ASSERT(false); }
129  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
130  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
131 
132 private:
133  AutoUTFInputStream(const AutoUTFInputStream&);
134  AutoUTFInputStream& operator=(const AutoUTFInputStream&);
135 
136  // Detect encoding type with BOM or RFC 4627
137  void DetectType() {
138  // BOM (Byte Order Mark):
139  // 00 00 FE FF UTF-32BE
140  // FF FE 00 00 UTF-32LE
141  // FE FF UTF-16BE
142  // FF FE UTF-16LE
143  // EF BB BF UTF-8
144 
145  const unsigned char* c = (const unsigned char *)is_->Peek4();
146  if (!c)
147  return;
148 
149  unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
150  hasBOM_ = false;
151  if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
152  else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
153  else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); }
154  else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); }
155  else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); }
156 
157  // RFC 4627: Section 3
158  // "Since the first two characters of a JSON text will always be ASCII
159  // characters [RFC0020], it is possible to determine whether an octet
160  // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
161  // at the pattern of nulls in the first four octets."
162  // 00 00 00 xx UTF-32BE
163  // 00 xx 00 xx UTF-16BE
164  // xx 00 00 00 UTF-32LE
165  // xx 00 xx 00 UTF-16LE
166  // xx xx xx xx UTF-8
167 
168  if (!hasBOM_) {
169  unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
170  switch (pattern) {
171  case 0x08: type_ = kUTF32BE; break;
172  case 0x0A: type_ = kUTF16BE; break;
173  case 0x01: type_ = kUTF32LE; break;
174  case 0x05: type_ = kUTF16LE; break;
175  case 0x0F: type_ = kUTF8; break;
176  default: break; // Use type defined by user.
177  }
178  }
179 
180  // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
181  if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
182  if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
183  }
184 
185  typedef Ch (*TakeFunc)(InputByteStream& is);
186  InputByteStream* is_;
187  UTFType type_;
188  Ch current_;
189  TakeFunc takeFunc_;
190  bool hasBOM_;
191 };
192 
193 //! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
194 /*!
195  \tparam CharType Type of character for writing.
196  \tparam InputByteStream type of output byte stream to be wrapped.
197 */
198 template <typename CharType, typename OutputByteStream>
200  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
201 public:
202  typedef CharType Ch;
203 
204  //! Constructor.
205  /*!
206  \param os output stream to be wrapped.
207  \param type UTF encoding type.
208  \param putBOM Whether to write BOM at the beginning of the stream.
209  */
210  AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
211  RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
212 
213  // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
214  if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
215  if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
216 
217  static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
218  putFunc_ = f[type_];
219 
220  if (putBOM)
221  PutBOM();
222  }
223 
224  UTFType GetType() const { return type_; }
225 
226  void Put(Ch c) { putFunc_(*os_, c); }
227  void Flush() { os_->Flush(); }
228 
229  // Not implemented
230  Ch Peek() const { RAPIDJSON_ASSERT(false); }
231  Ch Take() { RAPIDJSON_ASSERT(false); }
232  size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
233  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
234  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
235 
236 private:
237  AutoUTFOutputStream(const AutoUTFOutputStream&);
238  AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
239 
240  void PutBOM() {
241  typedef void (*PutBOMFunc)(OutputByteStream&);
242  static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
243  f[type_](*os_);
244  }
245 
246  typedef void (*PutFunc)(OutputByteStream&, Ch);
247 
248  OutputByteStream* os_;
249  UTFType type_;
250  PutFunc putFunc_;
251 };
252 
253 #undef RAPIDJSON_ENCODINGS_FUNC
254 
255 RAPIDJSON_NAMESPACE_END
256 
257 #ifdef __GNUC__
258 RAPIDJSON_DIAG_POP
259 #endif
260 
261 #endif // RAPIDJSON_ENCODEDSTREAM_H_
UTF-16 little endian.
Definition: encodings.h:540
AutoUTFOutputStream(OutputByteStream &os, UTFType type, bool putBOM)
Constructor.
Definition: encodedstream.h:210
UTF-32 little endian.
Definition: encodings.h:542
#define RAPIDJSON_STATIC_ASSERT(x)
(Internal) macro to check for conditions at compile-time
Definition: rapidjson.h:375
Output byte stream wrapper with statically bound encoding.
Definition: encodedstream.h:66
Input stream wrapper with dynamically bound encoding and automatic encoding detection.
Definition: encodedstream.h:101
UTF-8.
Definition: encodings.h:539
AutoUTFInputStream(InputByteStream &is, UTFType type=kUTF8)
Constructor.
Definition: encodedstream.h:111
Input byte stream wrapper with a statically bound encoding.
Definition: encodedstream.h:33
UTF-16 big endian.
Definition: encodings.h:541
common definitions and configuration
Output stream wrapper with dynamically bound encoding and automatic encoding detection.
Definition: encodedstream.h:199
UTF-32 big endian.
Definition: encodings.h:543
#define RAPIDJSON_ASSERT(x)
Assertion.
Definition: rapidjson.h:344
UTFType
Runtime-specified UTF encoding type of a stream.
Definition: encodings.h:538