diff options
Diffstat (limited to 'vendor/unicode-width/scripts/unicode.py')
-rwxr-xr-x | vendor/unicode-width/scripts/unicode.py | 505 |
1 files changed, 0 insertions, 505 deletions
diff --git a/vendor/unicode-width/scripts/unicode.py b/vendor/unicode-width/scripts/unicode.py deleted file mode 100755 index a6d58c5..0000000 --- a/vendor/unicode-width/scripts/unicode.py +++ /dev/null @@ -1,505 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT -# file at the top-level directory of this distribution and at -# http://rust-lang.org/COPYRIGHT. -# -# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - -# This script uses the following Unicode tables: -# - EastAsianWidth.txt -# - ReadMe.txt -# - UnicodeData.txt -# -# Since this should not require frequent updates, we just store this -# out-of-line and check the generated module into git. - -import enum -import math -import os -import re -import sys - -NUM_CODEPOINTS = 0x110000 -"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" - -MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1)) -"""The maximum number of bits required to represent a Unicode codepoint.""" - - -class OffsetType(enum.IntEnum): - """Represents the data type of a lookup table's offsets. Each variant's value represents the - number of bits required to represent that variant's type.""" - - U2 = 2 - """Offsets are 2-bit unsigned integers, packed four-per-byte.""" - U4 = 4 - """Offsets are 4-bit unsigned integers, packed two-per-byte.""" - U8 = 8 - """Each offset is a single byte (u8).""" - - -TABLE_CFGS = [ - (13, MAX_CODEPOINT_BITS, OffsetType.U8), - (6, 13, OffsetType.U8), - (0, 6, OffsetType.U2), -] -"""Represents the format of each level of the multi-level lookup table. -A level's entry is of the form `(low_bit, cap_bit, offset_type)`. -This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the -codepoint and those tables offsets are stored according to `offset_type`. - -If this is edited, you must ensure that `emit_module` reflects your changes.""" - -MODULE_FILENAME = "tables.rs" -"""The filename of the emitted Rust module (will be created in the working directory)""" - -Codepoint = int -BitPos = int - - -def fetch_open(filename: str): - """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, - fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.""" - if not os.path.exists(os.path.basename(filename)): - os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") - try: - return open(filename, encoding="utf-8") - except OSError: - sys.stderr.write(f"cannot load {filename}") - sys.exit(1) - - -def load_unicode_version() -> "tuple[int, int, int]": - """Returns the current Unicode version by fetching and processing `ReadMe.txt`.""" - with fetch_open("ReadMe.txt") as readme: - pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" - return tuple(map(int, re.search(pattern, readme.read()).groups())) - - -class EffectiveWidth(enum.IntEnum): - """Represents the width of a Unicode character. All East Asian Width classes resolve into - either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.""" - - ZERO = 0 - """ Zero columns wide. """ - NARROW = 1 - """ One column wide. """ - WIDE = 2 - """ Two columns wide. """ - AMBIGUOUS = 3 - """ Two columns wide in a CJK context. One column wide in all other contexts. """ - - -def load_east_asian_widths() -> "list[EffectiveWidth]": - """Return a list of effective widths, indexed by codepoint. - Widths are determined by fetching and parsing `EastAsianWidth.txt`. - - `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. - - `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. - - `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`.""" - with fetch_open("EastAsianWidth.txt") as eaw: - # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." - single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") - # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." - multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") - # map between width category code and condensed width - width_codes = { - **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]}, - **{c: EffectiveWidth.WIDE for c in ["W", "F"]}, - "A": EffectiveWidth.AMBIGUOUS, - } - - width_map = [] - current = 0 - for line in eaw.readlines(): - raw_data = None # (low, high, width) - if match := single.match(line): - raw_data = (match.group(1), match.group(1), match.group(2)) - elif match := multiple.match(line): - raw_data = (match.group(1), match.group(2), match.group(3)) - else: - continue - low = int(raw_data[0], 16) - high = int(raw_data[1], 16) - width = width_codes[raw_data[2]] - - assert current <= high - while current <= high: - # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt. - # All such codepoints are implicitly given Neural width (resolves to narrow) - width_map.append(EffectiveWidth.NARROW if current < low else width) - current += 1 - - while len(width_map) < NUM_CODEPOINTS: - # Catch any leftover codepoints and assign them implicit Neutral/narrow width. - width_map.append(EffectiveWidth.NARROW) - - return width_map - - -def load_zero_widths() -> "list[bool]": - """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width - character. `c` is considered a zero-width character if `c` is in general categories - `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`).""" - with fetch_open("UnicodeData.txt") as categories: - zw_map = [] - current = 0 - for line in categories.readlines(): - if len(raw_data := line.split(";")) != 15: - continue - [codepoint, name, cat_code] = [ - int(raw_data[0], 16), - raw_data[1], - raw_data[2], - ] - zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"] - - assert current <= codepoint - while current <= codepoint: - if name.endswith(", Last>") or current == codepoint: - # if name ends with Last, we backfill the width value to all codepoints since - # the previous codepoint (aka the start of the range) - zw_map.append(zero_width) - else: - # unassigned characters are implicitly given Neutral width, which is nonzero - zw_map.append(False) - current += 1 - - while len(zw_map) < NUM_CODEPOINTS: - # Catch any leftover codepoints. They must be unassigned (so nonzero width) - zw_map.append(False) - - return zw_map - - -class Bucket: - """A bucket contains a group of codepoints and an ordered width list. If one bucket's width - list overlaps with another's width list, those buckets can be merged via `try_extend`.""" - - def __init__(self): - """Creates an empty bucket.""" - self.entry_set = set() - self.widths = [] - - def append(self, codepoint: Codepoint, width: EffectiveWidth): - """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" - self.entry_set.add((codepoint, width)) - self.widths.append(width) - - def try_extend(self, attempt: "Bucket") -> bool: - """If either `self` or `attempt`'s width list starts with the other bucket's width list, - set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints - into `self`, and return `True`. Otherwise, return `False`.""" - (less, more) = (self.widths, attempt.widths) - if len(self.widths) > len(attempt.widths): - (less, more) = (attempt.widths, self.widths) - if less != more[: len(less)]: - return False - self.entry_set |= attempt.entry_set - self.widths = more - return True - - def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]": - """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" - result = list(self.entry_set) - result.sort() - return result - - def width(self) -> "EffectiveWidth": - """If all codepoints in this bucket have the same width, return that width; otherwise, - return `None`.""" - if len(self.widths) == 0: - return None - potential_width = self.widths[0] - for width in self.widths[1:]: - if potential_width != width: - return None - return potential_width - - -def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]": - """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All - codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the - same bucket. Returns a list of the buckets in increasing order of those bits.""" - num_bits = cap_bit - low_bit - assert num_bits > 0 - buckets = [Bucket() for _ in range(0, 2 ** num_bits)] - mask = (1 << num_bits) - 1 - for (codepoint, width) in entries: - buckets[(codepoint >> low_bit) & mask].append(codepoint, width) - return buckets - - -class Table: - """Represents a lookup table. Each table contains a certain number of subtables; each - subtable is indexed by a contiguous bit range of the codepoint and contains a list - of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) - - Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should - be indexed by sub-table `i` in the next-level lookup table. The entries of this table are - indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The - key to compression is that two different buckets in two different sub-tables may have the - same width list, which means that they can be merged into the same bucket. - - If no bucket contains two codepoints with different widths, calling `indices_to_widths` will - discard the buckets and convert the entries into `EffectiveWidth` values.""" - - def __init__( - self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType - ): - """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator - in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, - and each table entry is represented in the format specified by `offset_type`. Asserts - that this table is actually representable with `offset_type`.""" - self.low_bit = low_bit - self.cap_bit = cap_bit - self.offset_type = offset_type - self.entries = [] - self.indexed = [] - - buckets = [] - for entries in entry_groups: - buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) - - for bucket in buckets: - for (i, existing) in enumerate(self.indexed): - if existing.try_extend(bucket): - self.entries.append(i) - break - else: - self.entries.append(len(self.indexed)) - self.indexed.append(bucket) - - # Validate offset type - for index in self.entries: - assert index < (1 << int(self.offset_type)) - - def indices_to_widths(self): - """Destructively converts the indices in this table to the `EffectiveWidth` values of - their buckets. Assumes that no bucket contains codepoints with different widths.""" - self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries)) - del self.indexed - - def buckets(self): - """Returns an iterator over this table's buckets.""" - return self.indexed - - def to_bytes(self) -> "list[int]": - """Returns this table's entries as a list of bytes. The bytes are formatted according to - the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries - to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will - contain four packed 2-bit entries.""" - entries_per_byte = 8 // int(self.offset_type) - byte_array = [] - for i in range(0, len(self.entries), entries_per_byte): - byte = 0 - for j in range(0, entries_per_byte): - byte |= self.entries[i + j] << (j * int(self.offset_type)) - byte_array.append(byte) - return byte_array - - -def make_tables( - table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries -) -> "list[Table]": - """Creates a table for each configuration in `table_cfgs`, with the first config corresponding - to the top-level lookup table, the second config corresponding to the second-level lookup - table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs - to include in the top-level table.""" - tables = [] - entry_groups = [entries] - for (low_bit, cap_bit, offset_type) in table_cfgs: - table = Table(entry_groups, low_bit, cap_bit, offset_type) - entry_groups = map(lambda bucket: bucket.entries(), table.buckets()) - tables.append(table) - return tables - - -def emit_module( - out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" -): - """Outputs a Rust module to `out_name` using table data from `tables`. - If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.""" - if os.path.exists(out_name): - os.remove(out_name) - with open(out_name, "w", newline="\n", encoding="utf-8") as module: - module.write( - """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly -""" - ) - module.write( - f""" -/// The version of [Unicode](http://www.unicode.org/) -/// that this version of unicode-width is based on. -pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version}; -""" - ) - - module.write( - """ -pub mod charwidth { - use core::option::Option::{self, None, Some}; - - /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by - /// consulting a multi-level lookup table. - /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, - /// they're treated as single width. - /// - /// # Maintenance - /// The tables themselves are autogenerated but this function is hardcoded. You should have - /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) - /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the - /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes. - #[inline] - fn lookup_width(c: char, is_cjk: bool) -> usize { - let cp = c as usize; - - let t1_offset = TABLES_0[cp >> 13 & 0xFF]; - - // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte, - // so each sub-table is 128 bytes in size. - // (Sub-tables are selected using the computed offset from the previous table.) - let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)]; - - // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits. - // This is accomplished by packing four stored entries into one byte. - // So each sub-table is 2**(6-2) == 16 bytes in size. - // Since this is the last table, each entry represents an encoded width. - let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)]; - - // Extract the packed width - let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; - - // A width of 3 signifies that the codepoint is ambiguous width. - if width == 3 { - if is_cjk { - 2 - } else { - 1 - } - } else { - width.into() - } - } -""" - ) - - module.write( - """ - /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or - /// `None` if `c` is a control character other than `'\\x00'`. - /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, - /// they're treated as single width. - #[inline] - pub fn width(c: char, is_cjk: bool) -> Option<usize> { - if c < '\\u{7F}' { - if c >= '\\u{20}' { - // U+0020 to U+007F (exclusive) are single-width ASCII codepoints - Some(1) - } else if c == '\\0' { - // U+0000 *is* a control code, but it's special-cased - Some(0) - } else { - // U+0001 to U+0020 (exclusive) are control codes - None - } - } else if c >= '\\u{A0}' { - // No characters >= U+00A0 are control codes, so we can consult the lookup tables - Some(lookup_width(c, is_cjk)) - } else { - // U+007F to U+00A0 (exclusive) are control codes - None - } - } -""" - ) - - subtable_count = 1 - for (i, table) in enumerate(tables): - new_subtable_count = len(table.buckets()) - if i == len(tables) - 1: - table.indices_to_widths() # for the last table, indices == widths - byte_array = table.to_bytes() - module.write( - f""" - /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info. - static TABLES_{i}: [u8; {len(byte_array)}] = [""" - ) - for (j, byte) in enumerate(byte_array): - # Add line breaks for every 15th entry (chosen to match what rustfmt does) - if j % 15 == 0: - module.write("\n ") - module.write(f" 0x{byte:02X},") - module.write("\n ];\n") - subtable_count = new_subtable_count - module.write("}\n") - - -def main(module_filename: str): - """Obtain character data from the latest version of Unicode, transform it into a multi-level - lookup table for character width, and write a Rust module utilizing that table to - `module_filename`. - - We obey the following rules in decreasing order of importance: - - The soft hyphen (`U+00AD`) is single-width. - - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width. - - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width. - - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width. - - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. - - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width - of `Neutral`, `Narrow`, or `Halfwidth`) are single-width. - - These rules are based off of Markus Kuhn's free `wcwidth()` implementation: - http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c""" - version = load_unicode_version() - print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}") - - eaw_map = load_east_asian_widths() - zw_map = load_zero_widths() - - # Characters marked as zero-width in zw_map should be zero-width in the final map - width_map = list( - map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map)) - ) - - # Override for soft hyphen - width_map[0x00AD] = EffectiveWidth.NARROW - - # Override for Hangul Jamo medial vowels & final consonants - for i in range(0x1160, 0x11FF + 1): - width_map[i] = EffectiveWidth.ZERO - - tables = make_tables(TABLE_CFGS, enumerate(width_map)) - - print("------------------------") - total_size = 0 - for (i, table) in enumerate(tables): - size_bytes = len(table.to_bytes()) - print(f"Table {i} Size: {size_bytes} bytes") - total_size += size_bytes - print("------------------------") - print(f" Total Size: {total_size} bytes") - - emit_module(module_filename, version, tables) - print(f'Wrote to "{module_filename}"') - - -if __name__ == "__main__": - main(MODULE_FILENAME) |