#![allow(unused_unsafe)]
use Utf8Char;
use Utf16Char;
use Utf8Iterator;
use Utf16Iterator;
use error::*;
extern crate std;
use std::{char,u32, mem};
use std::ops::Not;
/// Extension methods on `u8` for inspecting a byte as the first byte of a
/// UTF-8 sequence.
pub trait U8UtfExt {
/// Returns how many additional bytes follow `self` when it is read as the
/// first byte of a UTF-8 sequence.
///
/// The `u8` implementation counts the leading one-bits of the byte:
/// none gives `Ok(0)`, exactly one gives `Err(ContinuationByte)`,
/// two to four give `Ok(ones - 1)`, and five or more give
/// `Err(TooLongSeqence)`.
fn extra_utf8_bytes(self) -> Result<usize,InvalidUtf8FirstByte>;
/// Same computation as `extra_utf8_bytes()` but without validation.
///
/// The `u8` implementation returns the number of leading one-bits minus one,
/// saturating at zero, so continuation bytes yield `0` and bytes with five
/// or more leading ones yield values above `3`.
fn extra_utf8_bytes_unchecked(self) -> usize;
}
impl U8UtfExt for u8 {
fn extra_utf8_bytes(self) -> Result<usize,InvalidUtf8FirstByte> {
use error::InvalidUtf8FirstByte::{ContinuationByte,TooLongSeqence};
match self.not().leading_zeros() {
0 => Ok(0),
1 => Err(ContinuationByte),
n if n < 5 => Ok(n as usize-1),
_ => Err(TooLongSeqence),
}
}
fn extra_utf8_bytes_unchecked(self) -> usize {
(self.not().leading_zeros()as usize).saturating_sub(1)
}
}
/// Extension methods on `u16` for inspecting a value as a UTF-16 code unit.
pub trait U16UtfExt {
    /// Tells whether `self`, read as the first unit of a UTF-16 sequence,
    /// must be followed by a second unit.
    ///
    /// Returns `None` if `self` is a trailing surrogate (`0xdc00` to `0xdfff`),
    /// `Some(true)` if it is a leading surrogate (`0xd800` to `0xdbff`),
    /// and `Some(false)` for every other value.
    fn utf16_needs_extra_unit(self) -> Option<bool>;

    /// Returns `true` if `self` is a leading surrogate (`0xd800` to `0xdbff`).
    fn utf16_is_leading_surrogate(self) -> bool;
}

impl U16UtfExt for u16 {
    fn utf16_needs_extra_unit(self) -> Option<bool> {
        // Both surrogate halves are identified by their top six bits.
        match self & 0xfc00 {
            0xdc00 => None,
            0xd800 => Some(true),
            _ => Some(false),
        }
    }

    fn utf16_is_leading_surrogate(self) -> bool {
        self >= 0xd800 && self <= 0xdbff
    }
}
/// Extension methods for converting a character to and from UTF-8 and UTF-16
/// representations. Implemented for `char` in this file.
pub trait CharExt: Sized {
/// Converts `self` into a `Utf8Char`.
fn to_utf8(self) -> Utf8Char;
/// Converts `self` into a `Utf16Char`.
fn to_utf16(self) -> Utf16Char;
/// Returns an iterator obtained from the `Utf8Char` form of `self`.
fn iter_utf8_bytes(self) -> Utf8Iterator;
/// Returns an iterator obtained from the `Utf16Char` form of `self`.
fn iter_utf16_units(self) -> Utf16Iterator;
/// Writes the UTF-8 form of `self` into `dst`.
///
/// The `char` implementation forwards to `Utf8Char::to_slice()`; the
/// `Option<usize>` is presumably the number of bytes written, with `None`
/// when `dst` is too short (confirm against `Utf8Char`).
fn to_utf8_slice(self, dst: &mut[u8]) -> Option<usize>;
/// Writes the UTF-16 form of `self` to the start of `dst`.
///
/// The `char` implementation returns the number of units written (1 or 2),
/// or `None` without writing anything if `dst` is too short.
fn to_utf16_slice(self, dst: &mut[u16]) -> Option<usize>;
/// Returns the UTF-8 bytes of `self` in a zero-padded array together with
/// the number of bytes used.
fn to_utf8_array(self) -> ([u8; 4], usize);
/// Returns the UTF-16 units of `self`; the second element is `Some` only
/// when a surrogate pair is required.
fn to_utf16_tuple(self) -> (u16, Option<u16>);
/// Decodes the UTF-8 sequence at the start of `src`, returning the
/// character and the number of bytes consumed. Trailing bytes are ignored.
fn from_utf8_slice(src: &[u8]) -> Result<(Self,usize),InvalidUtf8Slice>;
/// Decodes the UTF-16 sequence at the start of `src`, returning the
/// character and the number of units consumed. Trailing units are ignored.
fn from_utf16_slice(src: &[u16]) -> Result<(Self,usize), InvalidUtf16Slice>;
/// Decodes the UTF-8 sequence at the start of `utf8`; bytes after the
/// sequence are not examined.
fn from_utf8_array(utf8: [u8; 4]) -> Result<Self,InvalidUtf8Array>;
/// Decodes a character from one UTF-16 unit or a surrogate pair, rejecting
/// a missing, invalid or superfluous second unit.
fn from_utf16_tuple(utf16: (u16, Option<u16>)) -> Result<Self, InvalidUtf16Tuple>;
/// Decodes `src` as exactly one UTF-8 sequence without any validation.
///
/// Unsafe: the `char` implementation indexes `src[0]` and passes the
/// assembled value to `char::from_u32_unchecked()`, so `src` must be
/// non-empty and encode a valid code point.
unsafe fn from_utf8_exact_slice_unchecked(src: &[u8]) -> Self;
/// Combines UTF-16 units into a character without any validation.
///
/// Unsafe: the `char` implementation passes the combined value to
/// `char::from_u32_unchecked()`, so the units must form a valid sequence.
unsafe fn from_utf16_tuple_unchecked(utf16: (u16, Option<u16>)) -> Self;
/// Converts a code point into a character, reporting why it is invalid
/// (surrogate range or above `0x10ffff`) instead of returning `None`.
fn from_u32_detailed(c: u32) -> Result<Self,InvalidCodePoint>;
}
impl CharExt for char {
/// Converts this character into a `Utf8Char` through its `Into` conversion.
fn to_utf8(self) -> Utf8Char {
    Into::<Utf8Char>::into(self)
}
/// Converts this character to a `Utf8Char` and turns that into an iterator.
fn iter_utf8_bytes(self) -> Utf8Iterator {
    let encoded = self.to_utf8();
    encoded.into_iter()
}
/// Converts this character to a `Utf8Char` and forwards `dst` to its
/// `to_slice()` method, returning that method's result unchanged.
fn to_utf8_slice(self, dst: &mut [u8]) -> Option<usize> {
    let encoded = self.to_utf8();
    encoded.to_slice(dst)
}
fn to_utf8_array(self) -> ([u8; 4], usize) {
let len = self.len_utf8();
let mut c = self as u32;
if len == 1 {
([c as u8, 0, 0, 0], 1)
} else {
let mut parts = 0;
parts |= c & 0x3f; c>>=6;
parts<<=8; parts |= c & 0x3f; c>>=6;
parts<<=8; parts |= c & 0x3f; c>>=6;
parts<<=8; parts |= c & 0x3f;
parts |= 0x80_80_80_80;
parts >>= 8*(4-len);
parts |= (0xff_00u32 >> len) & 0xff;
parts &= Not::not(1u32 << 7-len);
let bytes: [u8; 4] = unsafe{ mem::transmute(u32::from_le(parts)) };
(bytes, len)
}
}
/// Decodes the UTF-8 sequence at the start of `src`.
///
/// On success returns the character and the number of bytes it occupied;
/// any bytes after the sequence are ignored.
///
/// # Errors
///
/// * `TooShort(n)` if `src` is empty or shorter than the `n` bytes the first
///   byte announces.
/// * `Utf8(FirstByte(_))` if the first byte cannot start a sequence.
/// * `Utf8(NotAContinuationByte(i))` if the byte at index `i` is not in
///   `0x80..=0xbf`.
/// * `Utf8(OverLong)` if the `overlong()` check flags the first two bytes.
/// * `CodePoint(_)` if the decoded value is not a valid code point.
fn from_utf8_slice(src: &[u8]) -> Result<(Self,usize),InvalidUtf8Slice> {
    use errors::InvalidUtf8::*;
    use errors::InvalidUtf8Slice::*;
    let first = *try!(src.first().ok_or(TooShort(1)));
    let extra = try!(first.extra_utf8_bytes().map_err(|e| Utf8(FirstByte(e)) ));
    if extra == 0 {
        return Ok((first as char, 1));
    } else if src.len() <= extra {
        return Err(TooShort(extra+1))
    }
    let src = &src[..1+extra];
    for (i, &b) in src.iter().enumerate().skip(1) {
        if b < 0b1000_0000 || b > 0b1011_1111 {
            return Err(Utf8(NotAContinuationByte(i)));
        }
    }
    if overlong(src[0], src[1]) {
        return Err(Utf8(OverLong));
    }
    // Assemble the value as a plain `u32`: it may still be a surrogate or lie
    // above 0x10ffff here, and creating a `char` from such a value (even just
    // to cast it back) is undefined behaviour. Only `from_u32_detailed()`
    // gets to turn it into a `char`, after validating it.
    let lead_bits = first as u32 & (0xffu32 >> (2 + extra));
    let code_point = src[1..]
        .iter()
        .fold(lead_bits, |acc, &b| (acc << 6) | (b & 0b0011_1111) as u32);
    char::from_u32_detailed(code_point).map(|c| (c,src.len()) ).map_err( CodePoint )
}
/// Decodes the UTF-8 sequence at the start of `utf8`.
///
/// Only the bytes belonging to the sequence announced by the first byte are
/// examined; the remaining bytes of the array are ignored.
///
/// # Errors
///
/// * `Utf8(FirstByte(_))` if the first byte cannot start a sequence.
/// * `Utf8(NotAContinuationByte(i))` if the byte at index `i` is not in
///   `0x80..=0xbf`.
/// * `Utf8(OverLong)` if the `overlong()` check flags the first two bytes.
/// * `CodePoint(_)` if the decoded value is not a valid code point.
fn from_utf8_array(utf8: [u8; 4]) -> Result<Self,InvalidUtf8Array> {
    use errors::InvalidUtf8::*;
    use errors::InvalidUtf8Array::*;
    let len = match utf8[0].extra_utf8_bytes() {
        Ok(0) => return Ok(utf8[0] as char),
        Ok(l) => l+1,
        Err(err) => return Err(Utf8(FirstByte(err))),
    };
    for (i, &b) in utf8[..len].iter().enumerate().skip(1) {
        if b < 0b1000_0000 || b > 0b1011_1111 {
            return Err(Utf8(NotAContinuationByte(i)));
        }
    }
    if overlong(utf8[0], utf8[1]) {
        return Err(Utf8(OverLong));
    }
    // Assemble the value as a plain `u32`: it may still be a surrogate or lie
    // above 0x10ffff here, and creating a `char` from such a value (even just
    // to cast it back) is undefined behaviour. Only `from_u32_detailed()`
    // gets to turn it into a `char`, after validating it.
    let lead_bits = utf8[0] as u32 & (0xffu32 >> (len + 1));
    let code_point = utf8[1..len]
        .iter()
        .fold(lead_bits, |acc, &b| (acc << 6) | (b & 0b0011_1111) as u32);
    char::from_u32_detailed(code_point).map_err( CodePoint )
}
/// Decodes `src` as exactly one UTF-8 sequence without validating it.
///
/// The data bits of the first byte are selected from the slice length and
/// the low six bits of every following byte are appended.
///
/// # Safety
///
/// The assembled value is passed to `char::from_u32_unchecked()`, so `src`
/// must encode a valid code point. Panics if `src` is empty.
unsafe fn from_utf8_exact_slice_unchecked(src: &[u8]) -> Self {
    if src.len() == 1 {
        return src[0] as char;
    }
    // An n-byte sequence keeps `8 - (n + 1)` data bits in its first byte.
    let lead_bits = src[0] as u32 & (0xff >> (src.len() + 1));
    let code_point = src[1..]
        .iter()
        .fold(lead_bits, |acc, &b| (acc << 6) | (b & 0b0011_1111) as u32);
    unsafe { char::from_u32_unchecked(code_point) }
}
/// Converts this character into a `Utf16Char` via `Utf16Char::from()`.
fn to_utf16(self) -> Utf16Char {
    let encoded = Utf16Char::from(self);
    encoded
}
/// Converts this character to a `Utf16Char` and turns that into an iterator.
fn iter_utf16_units(self) -> Utf16Iterator {
    let encoded = self.to_utf16();
    encoded.into_iter()
}
/// Writes the UTF-16 units of this character to the start of `dst`.
///
/// Returns the number of units written (1 or 2), or `None` if `dst` is too
/// short; in that case nothing is written.
fn to_utf16_slice(self, dst: &mut [u16]) -> Option<usize> {
    let (first, second) = self.to_utf16_tuple();
    let needed = if second.is_some() { 2 } else { 1 };
    if dst.len() < needed {
        return None;
    }
    dst[0] = first;
    if let Some(trailing) = second {
        dst[1] = trailing;
    }
    Some(needed)
}
/// Encodes this character as UTF-16.
///
/// Code points up to `0xffff` are returned as a single unit; higher ones
/// are split into a leading and a trailing surrogate.
fn to_utf16_tuple(self) -> (u16, Option<u16>) {
    let code_point = self as u32;
    match code_point.checked_sub(0x_01_00_00) {
        // Below 0x10000: fits in one unit.
        None => (code_point as u16, None),
        // The remaining 20 bits are split 10/10 between the two surrogates.
        Some(offset) => {
            let leading = 0x_d8_00 | (offset >> 10);
            let trailing = 0x_dc_00 | (offset & 0x_03_ff);
            (leading as u16, Some(trailing as u16))
        }
    }
}
fn from_utf16_slice(src: &[u16]) -> Result<(Self,usize), InvalidUtf16Slice> {
use errors::InvalidUtf16Slice::*;
let first = *try!(src.first().ok_or(EmptySlice));
match (first.utf16_needs_extra_unit(), src.get(1).cloned()) {
(Some(false), _ ) => Ok((1, None)),
(Some(true) , Some(0x_dc_00...0x_df_ff)) => Ok((2, Some(src[1]))),
(Some(true) , Some( _ )) => Err(SecondNotLowSurrogate),
(Some(true) , None ) => Err(MissingSecond),
(None , _ ) => Err(FirstLowSurrogate),
}.map(|(len,second)| (unsafe{ char::from_utf16_tuple_unchecked((first,second)) }, len) )
}
/// Decodes a character from one UTF-16 unit or a surrogate pair.
///
/// # Errors
///
/// * `FirstIsTrailingSurrogate` if the first unit is a trailing surrogate.
/// * `MissingSecond` if the first unit is a leading surrogate and there is
///   no second unit.
/// * `InvalidSecond` if the first unit is a leading surrogate and the second
///   is not a trailing surrogate.
/// * `SuperfluousSecond` if the first unit is not a surrogate but a second
///   unit was supplied.
fn from_utf16_tuple(utf16: (u16, Option<u16>)) -> Result<Self, InvalidUtf16Tuple> {
    use errors::InvalidUtf16Tuple::*;
    let (first, second) = utf16;
    // The top six bits tell leading (0xd800) and trailing (0xdc00) surrogates
    // apart from all other units.
    match (first & 0xfc00, second) {
        (0xdc00, _) => return Err(FirstIsTrailingSurrogate),
        (0xd800, None) => return Err(MissingSecond),
        (0xd800, Some(trailing)) => {
            if (trailing & 0xfc00) != 0xdc00 {
                return Err(InvalidSecond);
            }
        }
        (_, Some(_)) => return Err(SuperfluousSecond),
        (_, None) => {}
    }
    Ok(unsafe { char::from_utf16_tuple_unchecked(utf16) })
}
/// Combines UTF-16 units into a character without validating them.
///
/// # Safety
///
/// The combined value is passed to `char::from_u32_unchecked()`, so a lone
/// unit must not be a surrogate and a pair must consist of a leading
/// surrogate followed by a trailing surrogate.
unsafe fn from_utf16_tuple_unchecked(utf16: (u16, Option<u16>)) -> Self {
    let code_point = match utf16 {
        (unit, None) => unit as u32,
        (leading, Some(trailing)) => {
            let high = (leading as u32 - 0x_d8_00) << 10;
            let low = trailing as u32 - 0x_dc_00;
            (high | low) + 0x_01_00_00
        }
    };
    unsafe { char::from_u32_unchecked(code_point) }
}
/// Converts a code point into a character, reporting why it is invalid.
///
/// # Errors
///
/// * `Utf16Reserved` if `c` is in the surrogate range `0xd800` to `0xdfff`.
/// * `TooHigh` if `c` is `0x110000` or greater.
fn from_u32_detailed(c: u32) -> Result<Self, InvalidCodePoint> {
    use errors::InvalidCodePoint::*;
    match char::from_u32(c) {
        Some(valid) => Ok(valid),
        // `char::from_u32()` rejects exactly the surrogates and everything
        // above 0x10ffff, so the magnitude tells the two cases apart.
        None if c >= 0x11_00_00 => Err(TooHigh),
        None => Err(Utf16Reserved),
    }
}
}
/// Tells whether a multi-byte UTF-8 sequence starting with `first` and
/// `second` encodes a value that would have fit in a shorter sequence.
///
/// `first` is expected to be a lead byte with two to four leading one-bits
/// and `second` a continuation byte; the callers in this file have verified
/// both before calling.
fn overlong(first: u8, second: u8) -> bool {
    // Put the lead byte and the six data bits of the continuation byte
    // (its `10` marker is shifted out) next to each other.
    let both = ((first as u16) << 8) | (second << 2) as u16;
    // The number of leading ones in the lead byte is the sequence length.
    let len = (!both).leading_zeros();
    // Drop the length prefix and its terminating zero, leaving the most
    // significant data bits at the top.
    let data = both << (1 + len);
    // A 2-byte sequence carries 11 data bits and is overlong below 0x80,
    // i.e. when its top 4 bits are zero. 3- and 4-byte sequences (16 and 21
    // data bits, minimums 0x800 and 0x10000) are overlong when the top 5 are.
    let min_zero_bits = if len == 2 { 4 } else { 5 };
    data.leading_zeros() >= min_zero_bits
}