144 lines
3.1 KiB
Tcl
144 lines
3.1 KiB
Tcl
#!/usr/bin/env tclsh
|
|
|
|
# Generate UTF-8 case mapping tables
|
|
#
|
|
# (c) 2010 Steve Bennett <steveb@workware.net.au>
|
|
#
|
|
# See LICENCE for licence details.
|
|
#/
|
|
|
|
# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
|
|
# and http://unicode.org/Public/UNIDATA/EastAsianWidth.txt
|
|
# to generate case mapping and display width tables
|
|
set map(lower) {}
|
|
set map(upper) {}
|
|
set map(title) {}
|
|
set map(combining) {}
|
|
set map(wide) {}
|
|
|
|
set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt \[EastAsianWidth.txt\]"
|
|
set do_width 0
|
|
|
|
if {[lindex $argv 0] eq "-width"} {
|
|
set do_width 1
|
|
set argv [lrange $argv 1 end]
|
|
}
|
|
|
|
if {[llength $argv] ni {1 2}} {
|
|
puts stderr $USAGE
|
|
exit 1
|
|
}
|
|
|
|
lassign $argv unicodefile widthfile
|
|
|
|
set f [open $unicodefile]
|
|
while {[gets $f buf] >= 0} {
|
|
# Remove any trailing whitespace, especially errant CR
|
|
set buf [string trim $buf]
|
|
set title ""
|
|
set lower ""
|
|
set upper ""
|
|
lassign [split $buf ";"] code name class x x x x x x x x x upper lower title
|
|
set codex [string tolower 0x$code]
|
|
if {[string match M* $class]} {
|
|
if {![info exists combining]} {
|
|
set combining $codex
|
|
}
|
|
continue
|
|
} elseif {[info exists combining]} {
|
|
lappend map(combining) $combining $codex
|
|
unset combining
|
|
}
|
|
if {$codex <= 0x7f} {
|
|
continue
|
|
}
|
|
if {$codex > 0xffff} {
|
|
break
|
|
}
|
|
if {![string match L* $class]} {
|
|
continue
|
|
}
|
|
if {$upper ne ""} {
|
|
lappend map(upper) $codex [string tolower 0x$upper]
|
|
}
|
|
if {$lower ne ""} {
|
|
lappend map(lower) $codex [string tolower 0x$lower]
|
|
}
|
|
if {$title ne "" && $title ne $upper} {
|
|
if {$title eq $code} {
|
|
set title 0
|
|
}
|
|
lappend map(title) $codex [string tolower 0x$title]
|
|
}
|
|
}
|
|
close $f
|
|
|
|
proc output-int-pairs {list} {
|
|
set n 0
|
|
foreach {v1 v2} $list {
|
|
puts -nonewline "\t{ $v1, $v2 },"
|
|
if {[incr n] % 4 == 0} {
|
|
puts ""
|
|
}
|
|
}
|
|
if {$n % 4} {
|
|
puts ""
|
|
}
|
|
}
|
|
|
|
# Merges adjacent ranges in a list of ranges (lower upper lower upper ...)
|
|
proc combine-adjacent-ranges {list} {
|
|
set newlist {}
|
|
foreach {lower upper} $list {
|
|
if {[info exists prev_upper]} {
|
|
if {$lower == $prev_upper + 1} {
|
|
# combine these
|
|
set prev_upper $upper
|
|
continue
|
|
} else {
|
|
# can't combine
|
|
lappend newlist $prev_lower $prev_upper
|
|
}
|
|
}
|
|
set prev_lower $lower
|
|
set prev_upper $upper
|
|
}
|
|
# Now add the last range
|
|
lappend newlist $prev_lower $prev_upper
|
|
return $newlist
|
|
}
|
|
|
|
foreach type {upper lower title} {
|
|
puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
|
|
output-int-pairs $map($type)
|
|
puts "\};\n"
|
|
}
|
|
|
|
if {$do_width} {
|
|
set f [open $widthfile]
|
|
while {[gets $f buf] >= 0} {
|
|
# Remove any trailing whitespace, especially errant CR
|
|
set buf [string trim $buf]
|
|
if {[regexp {^([0-9A-Fa-f.]+);W} $buf -> range]} {
|
|
set range [string tolower $range]
|
|
lassign [split $range .] lower - upper
|
|
if {$upper eq ""} {
|
|
set upper $lower
|
|
}
|
|
lappend map(wide) 0x$lower 0x$upper
|
|
}
|
|
}
|
|
close $f
|
|
}
|
|
|
|
foreach type {combining wide} {
|
|
puts "static const struct utf8range unicode_range_$type\[\] = \{"
|
|
if {$do_width} {
|
|
output-int-pairs [combine-adjacent-ranges $map($type)]
|
|
} else {
|
|
# Just produce empty width tables in this case
|
|
output-int-pairs {}
|
|
}
|
|
puts "\};\n"
|
|
}
|