//! Operations related to UTF-8 validation.

use crate::mem;

use super::Utf8Error;

/// Returns the initial code point accumulator for the first byte of a sequence.
///
/// The first byte is special: the mask `0x7F >> width` keeps only its low
/// `7 - width` bits, i.e. 5 bits for width 2, 4 bits for width 3, and 3 bits
/// for width 4.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    (byte & payload_mask) as u32
}

/// Returns the value of `ch` updated with the continuation byte `byte`.
///
/// `ch` is shifted left by six bits and the bits of `byte` selected by
/// `CONT_MASK` are OR-ed into the vacated low positions.
#[inline]
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = (byte & CONT_MASK) as u32;
    (ch << 6) | payload
}

/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
/// bits `10`).
#[inline]
pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
    // Reinterpreted as `i8`, the continuation range 0x80..=0xBF maps to
    // -128..=-65, so a single signed comparison against 0xC0 (-64) suffices.
    const CONT_UPPER_BOUND: i8 = -64;
    let signed = byte as i8;
    signed < CONT_UPPER_BOUND
}

/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// Returns `None` only when the iterator is already exhausted; otherwise one
/// to four bytes are consumed and the decoded value is returned.
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string.
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    // Decode UTF-8
    let x = *bytes.next()?;
    if x < 128 {
        // Single-byte (ASCII) case: the byte is the code point.
        return Some(x as u32);
    }

    // Multibyte case follows
    // Decode from a byte combination out of: [[[x y] z] w]
    // NOTE: Performance is sensitive to the exact formulation here
    let init = utf8_first_byte(x, 2);
    // SAFETY: `bytes` produces a UTF-8-like string,
    // so the iterator must produce a value here.
    let y = unsafe { *bytes.next().unwrap_unchecked() };
    let mut ch = utf8_acc_cont_byte(init, y);
    if x >= 0xE0 {
        // [[x y z] w] case
        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
        // SAFETY: `bytes` produces a UTF-8-like string,
        // so the iterator must produce a value here.
        let z = unsafe { *bytes.next().unwrap_unchecked() };
        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
        ch = init << 12 | y_z;
        if x >= 0xF0 {
            // [x y z w] case
            // use only the lower 3 bits of `init`
            // SAFETY: `bytes` produces a UTF-8-like string,
            // so the iterator must produce a value here.
            let w = unsafe { *bytes.next().unwrap_unchecked() };
            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
        }
    }

    Some(ch)
}

/// Reads the last code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// Returns `None` when the iterator is already exhausted.
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string.
#[inline]
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
where
    I: DoubleEndedIterator<Item = &'a u8>,
{
    // Decode UTF-8, starting from the final byte of the sequence.
    let last = *bytes.next_back()?;
    if last < 128 {
        return Some(last as u32);
    }

    // Multibyte case: walk backwards until the leading byte is found. The
    // sequence has the shape [x [y [z w]]], where `last` plays the role of w.
    //
    // SAFETY: `bytes` produces a UTF-8-like string,
    // so the iterator must produce a value here.
    let before_last = unsafe { *bytes.next_back().unwrap_unchecked() };
    let prefix = if utf8_is_cont_byte(before_last) {
        // SAFETY: `bytes` produces a UTF-8-like string,
        // so the iterator must produce a value here.
        let third_last = unsafe { *bytes.next_back().unwrap_unchecked() };
        let head = if utf8_is_cont_byte(third_last) {
            // SAFETY: `bytes` produces a UTF-8-like string,
            // so the iterator must produce a value here.
            let lead = unsafe { *bytes.next_back().unwrap_unchecked() };
            // Four-byte sequence: `lead` is the first byte.
            utf8_acc_cont_byte(utf8_first_byte(lead, 4), third_last)
        } else {
            // Three-byte sequence: `third_last` is the first byte.
            utf8_first_byte(third_last, 3)
        };
        utf8_acc_cont_byte(head, before_last)
    } else {
        // Two-byte sequence: `before_last` is the first byte.
        utf8_first_byte(before_last, 2)
    };

    Some(utf8_acc_cont_byte(prefix, last))
}

// Mask that `contains_nonascii` ANDs with a `usize` word to detect non-ASCII bytes.
// NOTE(review): presumably `repeat_u8(0x80)` places 0x80 in every byte of the
// word (the high bit of each byte) — confirm against its definition.
const NONASCII_MASK: usize = usize::repeat_u8(0x80);

/// Returns `true` if any byte in the word `x` is non-ASCII (>= 128).
#[inline]
const fn contains_nonascii(x: usize) -> bool {
    // Keep only the bits selected by the mask; any survivor means a hit.
    let flagged_bits = x & NONASCII_MASK;
    flagged_bits != 0
}

/// Walks through `v` checking that it's a valid UTF-8 sequence,
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
///
/// On failure, `valid_up_to` is the index at which the offending sequence
/// starts. `error_len` is `None` when the slice ends in the middle of a
/// multibyte sequence, and `Some(1)`, `Some(2)` or `Some(3)` otherwise,
/// depending on which byte of the sequence was rejected.
#[inline(always)]
#[rustc_const_unstable(feature = "str_internals", issue = "none")]
pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
    let mut index = 0;
    let len = v.len();

    let usize_bytes = mem::size_of::<usize>();
    // The ASCII fast path below inspects two machine words per iteration.
    let ascii_block_size = 2 * usize_bytes;
    // First index at which a full two-word block no longer fits in `v`
    // (0 when `v` is shorter than one block, which disables the fast loop).
    let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
    // Offset from the start of `v` to the first `usize`-aligned byte.
    let align = v.as_ptr().align_offset(usize_bytes);

    while index < len {
        // Start of the sequence currently being examined; reported as
        // `valid_up_to` if this sequence turns out to be invalid.
        let old_offset = index;
        macro_rules! err {
            ($error_len: expr) => {
                return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
            };
        }

        // Advances to the next byte and yields it, or reports an error if the
        // input has run out.
        macro_rules! next {
            () => {{
                index += 1;
                // we needed data, but there was none: error!
                if index >= len {
                    err!(None)
                }
                v[index]
            }};
        }

        let first = v[index];
        if first >= 128 {
            let w = utf8_char_width(first);
            // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
            //        first  C2 80        last DF BF
            // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
            //        first  E0 A0 80     last EF BF BF
            //   excluding surrogates codepoints  \u{d800} to  \u{dfff}
            //               ED A0 80 to       ED BF BF
            // 4-byte encoding is for codepoints \u{10000} to \u{10ffff}
            //        first  F0 90 80 80  last F4 8F BF BF
            //
            // Use the UTF-8 syntax from the RFC
            //
            // https://tools.ietf.org/html/rfc3629
            // UTF8-1      = %x00-7F
            // UTF8-2      = %xC2-DF UTF8-tail
            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
            //               %xF4 %x80-8F 2( UTF8-tail )
            match w {
                2 => {
                    // `as i8 >= -64` is true exactly when the byte is NOT a
                    // continuation byte (i.e. not in 0x80..=0xBF).
                    if next!() as i8 >= -64 {
                        err!(Some(1))
                    }
                }
                3 => {
                    // The allowed range of the second byte depends on the
                    // first byte, per the UTF8-3 rule above.
                    match (first, next!()) {
                        (0xE0, 0xA0..=0xBF)
                        | (0xE1..=0xEC, 0x80..=0xBF)
                        | (0xED, 0x80..=0x9F)
                        | (0xEE..=0xEF, 0x80..=0xBF) => {}
                        _ => err!(Some(1)),
                    }
                    if next!() as i8 >= -64 {
                        err!(Some(2))
                    }
                }
                4 => {
                    // The allowed range of the second byte depends on the
                    // first byte, per the UTF8-4 rule above.
                    match (first, next!()) {
                        (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
                        _ => err!(Some(1)),
                    }
                    if next!() as i8 >= -64 {
                        err!(Some(2))
                    }
                    if next!() as i8 >= -64 {
                        err!(Some(3))
                    }
                }
                // Any other width from the table means `first` cannot start
                // a sequence.
                _ => err!(Some(1)),
            }
            // Step past the last byte of the sequence just validated.
            index += 1;
        } else {
            // Ascii case, try to skip forward quickly.
            // When the pointer is aligned, read 2 words of data per iteration
            // until we find a word containing a non-ascii byte.
            //
            // `align_offset` is documented to return `usize::MAX` when the
            // pointer cannot be aligned; in that case only the byte-wise path
            // is used.
            if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
                let ptr = v.as_ptr();
                while index < blocks_end {
                    // SAFETY: since `align - index` and `ascii_block_size` are
                    // multiples of `usize_bytes`, `block = ptr.add(index)` is
                    // always aligned with a `usize` so it's safe to dereference
                    // both `block` and `block.add(1)`.
                    unsafe {
                        let block = ptr.add(index) as *const usize;
                        // break if there is a nonascii byte
                        let zu = contains_nonascii(*block);
                        let zv = contains_nonascii(*block.add(1));
                        if zu || zv {
                            break;
                        }
                    }
                    index += ascii_block_size;
                }
                // step from the point where the wordwise loop stopped
                while index < len && v[index] < 128 {
                    index += 1;
                }
            } else {
                index += 1;
            }
        }
    }

    Ok(())
}

// Lookup table indexed by a byte value, giving the total number of bytes in a
// UTF-8 sequence that starts with that byte. An entry of 0 marks a byte that
// cannot start a sequence: 0x80..=0xBF, 0xC0, 0xC1 and 0xF5..=0xFF.
// Each row is one high nibble (noted at the end of the row); each column is
// one low nibble.
//
// https://tools.ietf.org/html/rfc3629
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
    // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
];

/// Given a first byte, determines how many bytes are in this UTF-8 character.
///
/// The result is read from the `UTF8_CHAR_WIDTH` table, using `b` as the index.
#[unstable(feature = "str_internals", issue = "none")]
#[must_use]
#[inline]
pub const fn utf8_char_width(b: u8) -> usize {
    let table_index = b as usize;
    let width = UTF8_CHAR_WIDTH[table_index];
    width as usize
}

/// Mask of the value bits of a continuation byte (its low six bits).
const CONT_MASK: u8 = 0b0011_1111;