|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +This is a small script to make an inquiry into the version history of unicode data tables, and to |
| 4 | +validate conflicts in the tables as they are published: |
| 5 | +
|
| 6 | +- check for individual code point definitions that change in subsequent releases, |
| 7 | + these should be considered before attempting to reduce the size of our versioned |
| 8 | + tables without a careful incremental change description. Each "violation" is |
| 9 | + logged as INFO. |
| 10 | +- check that a codepoint in the 'zero' table is not present in the 'wide' table |
| 11 | + and vice versa. This is logged as ERROR and causes the program to exit with status 1. |
| 12 | +
|
| 13 | +Some examples of the first kind, |
| 14 | +
|
| 15 | +1. |
| 16 | +
|
| 17 | + value 0x1f93b in table WIDE_EASTASIAN version 12.1.0 is not defined in 13.0.0 from range ('0x1f90d', '0x1f971') |
| 18 | + value 0x1f946 in table WIDE_EASTASIAN version 12.1.0 is not defined in 13.0.0 from range ('0x1f90d', '0x1f971') |
| 19 | +
|
| 20 | +two characters were changed from 'W' to 'N': |
| 21 | +
|
| 22 | + -EastAsianWidth-12.0.0.txt:1F90D..1F971;W # So [101] WHITE HEART..YAWNING FACE |
| 23 | + +EastAsianWidth-12.1.0.txt:1F90C..1F93A;W # So [47] PINCHED FINGERS..FENCER |
| 24 | + +EastAsianWidth-12.1.0.txt:1F93B;N # So MODERN PENTATHLON |
| 25 | + +EastAsianWidth-12.1.0.txt:1F93C..1F945;W # So [10] WRESTLERS..GOAL NET |
| 26 | + +EastAsianWidth-12.1.0.txt:1F946;N # So RIFLE |
| 27 | + +EastAsianWidth-12.1.0.txt:1F947..1F978;W # So [50] FIRST PLACE MEDAL..DISGUISED FACE |
| 28 | +
|
| 29 | +As well as for output, |
| 30 | +
|
| 31 | + value 0x11a3 in table WIDE_EASTASIAN version 6.1.0 is not defined in 6.2.0 from range ('0x11a3', '0x11a7') |
| 32 | + ... |
| 33 | + value 0x11fe in table WIDE_EASTASIAN version 6.1.0 is not defined in 6.2.0 from range ('0x11fa', '0x11ff') |
| 34 | +
|
| 35 | +Category code was changed from 'W' to 'N': |
| 36 | +
|
| 37 | + -EastAsianWidth-6.1.0.txt:11A3;W # HANGUL JUNGSEONG A-EU |
| 38 | + +EastAsianWidth-6.2.0.txt:11A3;N # HANGUL JUNGSEONG A-EU |
| 39 | +
|
| 40 | +
|
| 41 | +2. |
| 42 | +
|
| 43 | + value 0x1cf2 in table ZERO_WIDTH version 11.0.0 is not defined in 12.0.0 from range ('0x1cf2', '0x1cf4') |
| 44 | + value 0x1cf3 in table ZERO_WIDTH version 11.0.0 is not defined in 12.0.0 from range ('0x1cf2', '0x1cf4') |
| 45 | +
|
| 46 | +Category code was changed from 'Mc' to 'Lo': |
| 47 | +
|
| 48 | + -DerivedGeneralCategory-11.0.0.txt:1CF2..1CF3 ; Mc # [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA |
| 49 | + +DerivedGeneralCategory-12.0.0.txt:1CEE..1CF3 ; Lo # [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA |
| 50 | +
|
| 51 | +As well as for output, |
| 52 | +
|
| 53 | + value 0x19b0 in table ZERO_WIDTH version 7.0.0 is not defined in 8.0.0 from range ('0x19b0', '0x19c0') |
| 54 | + ... |
| 55 | + value 0x19c8 in table ZERO_WIDTH version 7.0.0 is not defined in 8.0.0 from range ('0x19c8', '0x19c9') |
| 56 | +
|
| 57 | +Category code was changed from 'Mc' to 'Lo': |
| 58 | +
|
| 59 | + -DerivedGeneralCategory-7.0.0.txt:19B0..19C0 ; Mc # [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY |
| 60 | + +DerivedGeneralCategory-8.0.0.txt:19B0..19C9 ; Lo # [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 |
| 61 | +""" |
| 62 | +# std imports |
| 63 | +import logging |
| 64 | + |
| 65 | + |
def main(log: logging.Logger):
    """
    Verify the integrity of the versioned ZERO_WIDTH and WIDE_EASTASIAN tables.

    Versions are visited from newest to oldest, as returned (reversed) by
    ``wcwidth.list_versions()``. For every version and each of the two tables:

    - every code point that is missing from the same table of the next (newer)
      version is logged at INFO level; the newest version has no successor, so
      this check is skipped for it,
    - every code point that is also found in the *other* table of the same
      version is logged at ERROR level and counted. An overlapping code point
      is seen once from each table, so it is counted twice.

    :param log: logger that receives the INFO and ERROR messages.
    :raises SystemExit: with exit code 1 when any overlap was counted.
    """
    # local
    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
    reversed_uni_versions = list(reversed(list_versions()))
    tables = {'ZERO_WIDTH': ZERO_WIDTH,
              'WIDE_EASTASIAN': WIDE_EASTASIAN}
    errors = 0
    for idx, version in enumerate(reversed_uni_versions):
        # The newest version (idx 0) has nothing newer to compare against, but
        # its two tables must still be checked against each other.
        next_version = reversed_uni_versions[idx - 1] if idx else None
        for table_name, table in tables.items():
            next_table = None if next_version is None else table[next_version]
            curr_table = table[version]
            other_table_name = 'WIDE_EASTASIAN' if table_name == 'ZERO_WIDTH' else 'ZERO_WIDTH'
            other_table = tables[other_table_name][version]
            for start_range, stop_range in curr_table:
                # Both ends of a table range are members of the range, so the
                # stop value itself must be visited (+ 1); this also covers
                # single code point ranges where start == stop.
                for unichar_n in range(start_range, stop_range + 1):
                    if next_table is not None and not _bisearch(unichar_n, next_table):
                        log.info(f'value {hex(unichar_n)} in table_name={table_name}'
                                 f' version={version} is not defined in next_version={next_version}'
                                 f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
                    if _bisearch(unichar_n, other_table):
                        log.error(f'value {hex(unichar_n)} in table_name={table_name}'
                                  f' version={version} is duplicated in other_table_name={other_table_name}'
                                  f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
                        errors += 1
    if errors:
        log.error(f'{errors} errors, exit 1')
        # SystemExit is always available; the bare ``exit`` name is only
        # injected by the ``site`` module and may be missing (e.g. python -S).
        raise SystemExit(1)
| 96 | + |
| 97 | + |
if __name__ == '__main__':
    # Script entry point: (re)configure the root logger at INFO level, with the
    # level name and source location in front of every message, then run the
    # table checks with that root logger.
    logging.basicConfig(
        format='%(levelname)s %(filename)s:%(lineno)d %(message)s',
        level="INFO",
        force=True,
    )
    main(logging.getLogger())
0 commit comments