diff options
Diffstat (limited to 'vendor/unicode-width/scripts')
-rwxr-xr-x | vendor/unicode-width/scripts/unicode.py | 505 |
1 files changed, 505 insertions, 0 deletions
diff --git a/vendor/unicode-width/scripts/unicode.py b/vendor/unicode-width/scripts/unicode.py new file mode 100755 index 0000000..a6d58c5 --- /dev/null +++ b/vendor/unicode-width/scripts/unicode.py @@ -0,0 +1,505 @@ +#!/usr/bin/env python3 +# +# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# This script uses the following Unicode tables: +# - EastAsianWidth.txt +# - ReadMe.txt +# - UnicodeData.txt +# +# Since this should not require frequent updates, we just store this +# out-of-line and check the generated module into git. + +import enum +import math +import os +import re +import sys + +NUM_CODEPOINTS = 0x110000 +"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" + +MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1)) +"""The maximum number of bits required to represent a Unicode codepoint.""" + + +class OffsetType(enum.IntEnum): + """Represents the data type of a lookup table's offsets. Each variant's value represents the + number of bits required to represent that variant's type.""" + + U2 = 2 + """Offsets are 2-bit unsigned integers, packed four-per-byte.""" + U4 = 4 + """Offsets are 4-bit unsigned integers, packed two-per-byte.""" + U8 = 8 + """Each offset is a single byte (u8).""" + + +TABLE_CFGS = [ + (13, MAX_CODEPOINT_BITS, OffsetType.U8), + (6, 13, OffsetType.U8), + (0, 6, OffsetType.U2), +] +"""Represents the format of each level of the multi-level lookup table. +A level's entry is of the form `(low_bit, cap_bit, offset_type)`. +This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the +codepoint and those tables offsets are stored according to `offset_type`. + +If this is edited, you must ensure that `emit_module` reflects your changes.""" + +MODULE_FILENAME = "tables.rs" +"""The filename of the emitted Rust module (will be created in the working directory)""" + +Codepoint = int +BitPos = int + + +def fetch_open(filename: str): + """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, + fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.""" + if not os.path.exists(os.path.basename(filename)): + os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") + try: + return open(filename, encoding="utf-8") + except OSError: + sys.stderr.write(f"cannot load {filename}") + sys.exit(1) + + +def load_unicode_version() -> "tuple[int, int, int]": + """Returns the current Unicode version by fetching and processing `ReadMe.txt`.""" + with fetch_open("ReadMe.txt") as readme: + pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" + return tuple(map(int, re.search(pattern, readme.read()).groups())) + + +class EffectiveWidth(enum.IntEnum): + """Represents the width of a Unicode character. All East Asian Width classes resolve into + either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.""" + + ZERO = 0 + """ Zero columns wide. """ + NARROW = 1 + """ One column wide. """ + WIDE = 2 + """ Two columns wide. """ + AMBIGUOUS = 3 + """ Two columns wide in a CJK context. One column wide in all other contexts. """ + + +def load_east_asian_widths() -> "list[EffectiveWidth]": + """Return a list of effective widths, indexed by codepoint. + Widths are determined by fetching and parsing `EastAsianWidth.txt`. + + `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. + + `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. + + `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`.""" + with fetch_open("EastAsianWidth.txt") as eaw: + # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." + single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") + # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." + multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") + # map between width category code and condensed width + width_codes = { + **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]}, + **{c: EffectiveWidth.WIDE for c in ["W", "F"]}, + "A": EffectiveWidth.AMBIGUOUS, + } + + width_map = [] + current = 0 + for line in eaw.readlines(): + raw_data = None # (low, high, width) + if match := single.match(line): + raw_data = (match.group(1), match.group(1), match.group(2)) + elif match := multiple.match(line): + raw_data = (match.group(1), match.group(2), match.group(3)) + else: + continue + low = int(raw_data[0], 16) + high = int(raw_data[1], 16) + width = width_codes[raw_data[2]] + + assert current <= high + while current <= high: + # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt. + # All such codepoints are implicitly given Neural width (resolves to narrow) + width_map.append(EffectiveWidth.NARROW if current < low else width) + current += 1 + + while len(width_map) < NUM_CODEPOINTS: + # Catch any leftover codepoints and assign them implicit Neutral/narrow width. + width_map.append(EffectiveWidth.NARROW) + + return width_map + + +def load_zero_widths() -> "list[bool]": + """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width + character. `c` is considered a zero-width character if `c` is in general categories + `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`).""" + with fetch_open("UnicodeData.txt") as categories: + zw_map = [] + current = 0 + for line in categories.readlines(): + if len(raw_data := line.split(";")) != 15: + continue + [codepoint, name, cat_code] = [ + int(raw_data[0], 16), + raw_data[1], + raw_data[2], + ] + zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"] + + assert current <= codepoint + while current <= codepoint: + if name.endswith(", Last>") or current == codepoint: + # if name ends with Last, we backfill the width value to all codepoints since + # the previous codepoint (aka the start of the range) + zw_map.append(zero_width) + else: + # unassigned characters are implicitly given Neutral width, which is nonzero + zw_map.append(False) + current += 1 + + while len(zw_map) < NUM_CODEPOINTS: + # Catch any leftover codepoints. They must be unassigned (so nonzero width) + zw_map.append(False) + + return zw_map + + +class Bucket: + """A bucket contains a group of codepoints and an ordered width list. If one bucket's width + list overlaps with another's width list, those buckets can be merged via `try_extend`.""" + + def __init__(self): + """Creates an empty bucket.""" + self.entry_set = set() + self.widths = [] + + def append(self, codepoint: Codepoint, width: EffectiveWidth): + """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" + self.entry_set.add((codepoint, width)) + self.widths.append(width) + + def try_extend(self, attempt: "Bucket") -> bool: + """If either `self` or `attempt`'s width list starts with the other bucket's width list, + set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints + into `self`, and return `True`. Otherwise, return `False`.""" + (less, more) = (self.widths, attempt.widths) + if len(self.widths) > len(attempt.widths): + (less, more) = (attempt.widths, self.widths) + if less != more[: len(less)]: + return False + self.entry_set |= attempt.entry_set + self.widths = more + return True + + def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]": + """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" + result = list(self.entry_set) + result.sort() + return result + + def width(self) -> "EffectiveWidth": + """If all codepoints in this bucket have the same width, return that width; otherwise, + return `None`.""" + if len(self.widths) == 0: + return None + potential_width = self.widths[0] + for width in self.widths[1:]: + if potential_width != width: + return None + return potential_width + + +def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]": + """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All + codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the + same bucket. Returns a list of the buckets in increasing order of those bits.""" + num_bits = cap_bit - low_bit + assert num_bits > 0 + buckets = [Bucket() for _ in range(0, 2 ** num_bits)] + mask = (1 << num_bits) - 1 + for (codepoint, width) in entries: + buckets[(codepoint >> low_bit) & mask].append(codepoint, width) + return buckets + + +class Table: + """Represents a lookup table. Each table contains a certain number of subtables; each + subtable is indexed by a contiguous bit range of the codepoint and contains a list + of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) + + Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should + be indexed by sub-table `i` in the next-level lookup table. The entries of this table are + indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The + key to compression is that two different buckets in two different sub-tables may have the + same width list, which means that they can be merged into the same bucket. + + If no bucket contains two codepoints with different widths, calling `indices_to_widths` will + discard the buckets and convert the entries into `EffectiveWidth` values.""" + + def __init__( + self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType + ): + """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator + in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, + and each table entry is represented in the format specified by `offset_type`. Asserts + that this table is actually representable with `offset_type`.""" + self.low_bit = low_bit + self.cap_bit = cap_bit + self.offset_type = offset_type + self.entries = [] + self.indexed = [] + + buckets = [] + for entries in entry_groups: + buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) + + for bucket in buckets: + for (i, existing) in enumerate(self.indexed): + if existing.try_extend(bucket): + self.entries.append(i) + break + else: + self.entries.append(len(self.indexed)) + self.indexed.append(bucket) + + # Validate offset type + for index in self.entries: + assert index < (1 << int(self.offset_type)) + + def indices_to_widths(self): + """Destructively converts the indices in this table to the `EffectiveWidth` values of + their buckets. Assumes that no bucket contains codepoints with different widths.""" + self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries)) + del self.indexed + + def buckets(self): + """Returns an iterator over this table's buckets.""" + return self.indexed + + def to_bytes(self) -> "list[int]": + """Returns this table's entries as a list of bytes. The bytes are formatted according to + the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries + to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will + contain four packed 2-bit entries.""" + entries_per_byte = 8 // int(self.offset_type) + byte_array = [] + for i in range(0, len(self.entries), entries_per_byte): + byte = 0 + for j in range(0, entries_per_byte): + byte |= self.entries[i + j] << (j * int(self.offset_type)) + byte_array.append(byte) + return byte_array + + +def make_tables( + table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries +) -> "list[Table]": + """Creates a table for each configuration in `table_cfgs`, with the first config corresponding + to the top-level lookup table, the second config corresponding to the second-level lookup + table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs + to include in the top-level table.""" + tables = [] + entry_groups = [entries] + for (low_bit, cap_bit, offset_type) in table_cfgs: + table = Table(entry_groups, low_bit, cap_bit, offset_type) + entry_groups = map(lambda bucket: bucket.entries(), table.buckets()) + tables.append(table) + return tables + + +def emit_module( + out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" +): + """Outputs a Rust module to `out_name` using table data from `tables`. + If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.""" + if os.path.exists(out_name): + os.remove(out_name) + with open(out_name, "w", newline="\n", encoding="utf-8") as module: + module.write( + """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly +""" + ) + module.write( + f""" +/// The version of [Unicode](http://www.unicode.org/) +/// that this version of unicode-width is based on. +pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version}; +""" + ) + + module.write( + """ +pub mod charwidth { + use core::option::Option::{self, None, Some}; + + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by + /// consulting a multi-level lookup table. + /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, + /// they're treated as single width. + /// + /// # Maintenance + /// The tables themselves are autogenerated but this function is hardcoded. You should have + /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) + /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the + /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes. + #[inline] + fn lookup_width(c: char, is_cjk: bool) -> usize { + let cp = c as usize; + + let t1_offset = TABLES_0[cp >> 13 & 0xFF]; + + // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte, + // so each sub-table is 128 bytes in size. + // (Sub-tables are selected using the computed offset from the previous table.) + let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)]; + + // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits. + // This is accomplished by packing four stored entries into one byte. + // So each sub-table is 2**(6-2) == 16 bytes in size. + // Since this is the last table, each entry represents an encoded width. + let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)]; + + // Extract the packed width + let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; + + // A width of 3 signifies that the codepoint is ambiguous width. + if width == 3 { + if is_cjk { + 2 + } else { + 1 + } + } else { + width.into() + } + } +""" + ) + + module.write( + """ + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or + /// `None` if `c` is a control character other than `'\\x00'`. + /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, + /// they're treated as single width. + #[inline] + pub fn width(c: char, is_cjk: bool) -> Option<usize> { + if c < '\\u{7F}' { + if c >= '\\u{20}' { + // U+0020 to U+007F (exclusive) are single-width ASCII codepoints + Some(1) + } else if c == '\\0' { + // U+0000 *is* a control code, but it's special-cased + Some(0) + } else { + // U+0001 to U+0020 (exclusive) are control codes + None + } + } else if c >= '\\u{A0}' { + // No characters >= U+00A0 are control codes, so we can consult the lookup tables + Some(lookup_width(c, is_cjk)) + } else { + // U+007F to U+00A0 (exclusive) are control codes + None + } + } +""" + ) + + subtable_count = 1 + for (i, table) in enumerate(tables): + new_subtable_count = len(table.buckets()) + if i == len(tables) - 1: + table.indices_to_widths() # for the last table, indices == widths + byte_array = table.to_bytes() + module.write( + f""" + /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info. + static TABLES_{i}: [u8; {len(byte_array)}] = [""" + ) + for (j, byte) in enumerate(byte_array): + # Add line breaks for every 15th entry (chosen to match what rustfmt does) + if j % 15 == 0: + module.write("\n ") + module.write(f" 0x{byte:02X},") + module.write("\n ];\n") + subtable_count = new_subtable_count + module.write("}\n") + + +def main(module_filename: str): + """Obtain character data from the latest version of Unicode, transform it into a multi-level + lookup table for character width, and write a Rust module utilizing that table to + `module_filename`. + + We obey the following rules in decreasing order of importance: + - The soft hyphen (`U+00AD`) is single-width. + - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width. + - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width. + - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width. + - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. + - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width + of `Neutral`, `Narrow`, or `Halfwidth`) are single-width. + + These rules are based off of Markus Kuhn's free `wcwidth()` implementation: + http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c""" + version = load_unicode_version() + print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}") + + eaw_map = load_east_asian_widths() + zw_map = load_zero_widths() + + # Characters marked as zero-width in zw_map should be zero-width in the final map + width_map = list( + map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map)) + ) + + # Override for soft hyphen + width_map[0x00AD] = EffectiveWidth.NARROW + + # Override for Hangul Jamo medial vowels & final consonants + for i in range(0x1160, 0x11FF + 1): + width_map[i] = EffectiveWidth.ZERO + + tables = make_tables(TABLE_CFGS, enumerate(width_map)) + + print("------------------------") + total_size = 0 + for (i, table) in enumerate(tables): + size_bytes = len(table.to_bytes()) + print(f"Table {i} Size: {size_bytes} bytes") + total_size += size_bytes + print("------------------------") + print(f" Total Size: {total_size} bytes") + + emit_module(module_filename, version, tables) + print(f'Wrote to "{module_filename}"') + + +if __name__ == "__main__": + main(MODULE_FILENAME) |