Boost.Nowide
utf8_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
9 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
10 
#include <boost/nowide/detail/utf.hpp>
#include <boost/nowide/replacement.hpp>
#include <boost/cstdint.hpp>
#include <boost/static_assert.hpp>
#include <locale>
16 
17 namespace boost {
18 namespace nowide {
19 
20  //
21  // Make sure that mbstate can keep 16 bit of UTF-16 sequence
22  //
23  BOOST_STATIC_ASSERT(sizeof(std::mbstate_t) >= 2);
24  namespace detail {
// Copy exactly two bytes from src to dst.
// Hand-written so that <cstring> does not have to be included just for std::memcpy.
// Both pointers must refer to at least two accessible bytes.
inline void copy_uint16_t(void* dst, const void* src)
{
    const unsigned char* in = static_cast<const unsigned char*>(src);
    unsigned char* out = static_cast<unsigned char*>(dst);
    for(int i = 0; i < 2; ++i)
        out[i] = in[i];
}
33  inline boost::uint16_t read_state(const std::mbstate_t& src)
34  {
35  boost::uint16_t dst;
36  copy_uint16_t(&dst, &src);
37  return dst;
38  }
39  inline void write_state(std::mbstate_t& dst, const boost::uint16_t src)
40  {
41  copy_uint16_t(&dst, &src);
42  }
43  } // namespace detail
44 
// MSVC before _MSC_VER 1700 declares codecvt::do_length in a non-standard way:
// it counts wide characters instead of narrow ones and takes the mbstate_t
// by const reference (so the state cannot be updated). The overrides below
// check this macro to match that signature and return value.
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#define BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
#endif
49 
/// Codecvt facet converting between UTF-8 encoded `char` sequences and
/// sequences of \a CharType.
///
/// \tparam CharType wide character type of the internal encoding
/// \tparam CharSize size of \a CharType in bytes; defaults to sizeof(CharType)
///         and selects the partial specialization that implements the facet
///
/// This is the declaration of the primary template only.
template<typename CharType, int CharSize = sizeof(CharType)> class utf8_codecvt;
58 
59  template<typename CharType>
60  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
61  {
62  public:
63  BOOST_STATIC_ASSERT_MSG(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
64 
65  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
66  {}
67 
68  protected:
69  typedef CharType uchar;
70 
71  virtual std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const
72  {
73  if(detail::read_state(s) != 0)
74  return std::codecvt_base::error;
75  next = from;
76  return std::codecvt_base::ok;
77  }
78  virtual int do_encoding() const throw()
79  {
80  return 0;
81  }
82  virtual int do_max_length() const throw()
83  {
84  return 4;
85  }
86  virtual bool do_always_noconv() const throw()
87  {
88  return false;
89  }
90 
91  virtual int do_length(std::mbstate_t
92 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
93  const
94 #endif
95  & std_state,
96  const char* from,
97  const char* from_end,
98  size_t max) const
99  {
100  boost::uint16_t state = detail::read_state(std_state);
101 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
102  const char* save_from = from;
103 #else
104  size_t save_max = max;
105 #endif
106  while(max > 0 && from < from_end)
107  {
108  const char* prev_from = from;
109  boost::uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
110  if(ch == detail::utf::illegal)
111  {
113  } else if(ch == detail::utf::incomplete)
114  {
115  from = prev_from;
116  break;
117  }
118  max--;
119  if(ch > 0xFFFF)
120  {
121  if(state == 0)
122  {
123  from = prev_from;
124  state = 1;
125  } else
126  {
127  state = 0;
128  }
129  }
130  }
131 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
132  detail::write_state(std_state, state);
133  return static_cast<int>(from - save_from);
134 #else
135  return static_cast<int>(save_max - max);
136 #endif
137  }
138 
139  virtual std::codecvt_base::result do_in(std::mbstate_t& std_state,
140  const char* from,
141  const char* from_end,
142  const char*& from_next,
143  uchar* to,
144  uchar* to_end,
145  uchar*& to_next) const
146  {
147  std::codecvt_base::result r = std::codecvt_base::ok;
148 
149  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
150  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
151  //
152  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
153  // and first pair is written, but no input consumed
154  boost::uint16_t state = detail::read_state(std_state);
155  while(to < to_end && from < from_end)
156  {
157  const char* from_saved = from;
158 
159  uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
160 
161  if(ch == detail::utf::illegal)
162  {
164  } else if(ch == detail::utf::incomplete)
165  {
166  from = from_saved;
167  r = std::codecvt_base::partial;
168  break;
169  }
170  // Normal codepoints go directly to stream
171  if(ch <= 0xFFFF)
172  {
173  *to++ = static_cast<CharType>(ch);
174  } else
175  {
176  // for other codepoints we do following
177  //
178  // 1. We can't consume our input as we may find ourself
179  // in state where all input consumed but not all output written,i.e. only
180  // 1st pair is written
181  // 2. We only write first pair and mark this in the state, we also revert back
182  // the from pointer in order to make sure this codepoint would be read
183  // once again and then we would consume our input together with writing
184  // second surrogate pair
185  ch -= 0x10000;
186  boost::uint16_t vh = static_cast<boost::uint16_t>(ch >> 10);
187  boost::uint16_t vl = ch & 0x3FF;
188  boost::uint16_t w1 = vh + 0xD800;
189  boost::uint16_t w2 = vl + 0xDC00;
190  if(state == 0)
191  {
192  from = from_saved;
193  *to++ = static_cast<CharType>(w1);
194  state = 1;
195  } else
196  {
197  *to++ = static_cast<CharType>(w2);
198  state = 0;
199  }
200  }
201  }
202  from_next = from;
203  to_next = to;
204  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
205  r = std::codecvt_base::partial;
206  detail::write_state(std_state, state);
207  return r;
208  }
209 
210  virtual std::codecvt_base::result do_out(std::mbstate_t& std_state,
211  const uchar* from,
212  const uchar* from_end,
213  const uchar*& from_next,
214  char* to,
215  char* to_end,
216  char*& to_next) const
217  {
218  std::codecvt_base::result r = std::codecvt_base::ok;
219  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
220  // according to standard. We assume that sizeof(mbstate_t) >=2 in order
221  // to be able to store first observed surrogate pair
222  //
223  // State: state!=0 - a first surrogate pair was observed (state = first pair),
224  // we expect the second one to come and then zero the state
226  boost::uint16_t state = detail::read_state(std_state);
227  while(to < to_end && from < from_end)
228  {
229  boost::uint32_t ch = 0;
230  if(state != 0)
231  {
232  // if the state indicates that 1st surrogate pair was written
233  // we should make sure that the second one that comes is actually
234  // second surrogate
235  boost::uint16_t w1 = state;
236  boost::uint16_t w2 = *from;
237  // we don't forward from as writing may fail to incomplete or
238  // partial conversion
239  if(0xDC00 <= w2 && w2 <= 0xDFFF)
240  {
241  boost::uint16_t vh = w1 - 0xD800;
242  boost::uint16_t vl = w2 - 0xDC00;
243  ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
244  } else
245  {
247  }
248  } else
249  {
250  ch = *from;
251  if(0xD800 <= ch && ch <= 0xDBFF)
252  {
253  // if this is a first surrogate pair we put
254  // it into the state and consume it, note we don't
255  // go forward as it should be illegal so we increase
256  // the from pointer manually
257  state = static_cast<boost::uint16_t>(ch);
258  from++;
259  continue;
260  } else if(0xDC00 <= ch && ch <= 0xDFFF)
261  {
262  // if we observe second surrogate pair and
263  // first only may be expected we should break from the loop with error
264  // as it is illegal input
266  }
267  }
268  if(!detail::utf::is_valid_codepoint(ch))
269  {
270  r = std::codecvt_base::error;
271  break;
272  }
273  int len = detail::utf::utf_traits<char>::width(ch);
274  if(to_end - to < len)
275  {
276  r = std::codecvt_base::partial;
277  break;
278  }
279  to = detail::utf::utf_traits<char>::encode(ch, to);
280  state = 0;
281  from++;
282  }
283  from_next = from;
284  to_next = to;
285  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
286  r = std::codecvt_base::partial;
287  detail::write_state(std_state, state);
288  return r;
289  }
290  };
291 
292  template<typename CharType>
293  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
294  {
295  public:
296  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
297  {}
298 
299  protected:
300  typedef CharType uchar;
301 
302  virtual std::codecvt_base::result do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const
303  {
304  next = from;
305  return std::codecvt_base::ok;
306  }
307  virtual int do_encoding() const throw()
308  {
309  return 0;
310  }
311  virtual int do_max_length() const throw()
312  {
313  return 4;
314  }
315  virtual bool do_always_noconv() const throw()
316  {
317  return false;
318  }
319 
320  virtual int do_length(std::mbstate_t
321 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
322  const
323 #endif
324  & /*state*/,
325  const char* from,
326  const char* from_end,
327  size_t max) const
328  {
329 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
330  const char* start_from = from;
331 #else
332  size_t save_max = max;
333 #endif
334 
335  while(max > 0 && from < from_end)
336  {
337  const char* save_from = from;
338  boost::uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
339  if(ch == detail::utf::incomplete)
340  {
341  from = save_from;
342  break;
343  } else if(ch == detail::utf::illegal)
344  {
346  }
347  max--;
348  }
349 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
350  return from - start_from;
351 #else
352  return save_max - max;
353 #endif
354  }
355 
356  virtual std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
357  const char* from,
358  const char* from_end,
359  const char*& from_next,
360  uchar* to,
361  uchar* to_end,
362  uchar*& to_next) const
363  {
364  std::codecvt_base::result r = std::codecvt_base::ok;
365 
366  while(to < to_end && from < from_end)
367  {
368  const char* from_saved = from;
369 
370  uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
371 
372  if(ch == detail::utf::illegal)
373  {
375  } else if(ch == detail::utf::incomplete)
376  {
377  r = std::codecvt_base::partial;
378  from = from_saved;
379  break;
380  }
381  *to++ = ch;
382  }
383  from_next = from;
384  to_next = to;
385  if(r == std::codecvt_base::ok && from != from_end)
386  r = std::codecvt_base::partial;
387  return r;
388  }
389 
390  virtual std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
391  const uchar* from,
392  const uchar* from_end,
393  const uchar*& from_next,
394  char* to,
395  char* to_end,
396  char*& to_next) const
397  {
398  std::codecvt_base::result r = std::codecvt_base::ok;
399  while(to < to_end && from < from_end)
400  {
401  boost::uint32_t ch = 0;
402  ch = *from;
403  if(!detail::utf::is_valid_codepoint(ch))
404  {
406  }
407  int len = detail::utf::utf_traits<char>::width(ch);
408  if(to_end - to < len)
409  {
410  r = std::codecvt_base::partial;
411  break;
412  }
413  to = detail::utf::utf_traits<char>::encode(ch, to);
414  from++;
415  }
416  from_next = from;
417  to_next = to;
418  if(r == std::codecvt_base::ok && from != from_end)
419  r = std::codecvt_base::partial;
420  return r;
421  }
422  };
423 
424 } // namespace nowide
425 } // namespace boost
426 
427 #endif
Definition: utf8_codecvt.hpp:57
#define BOOST_NOWIDE_REPLACEMENT_CHARACTER
Definition: replacement.hpp:16