#!/store/bin/perl5 -w # # Author: Petter Reinholdtsen # Date: 1998-08-31 # # Check GNU libc charmaps for consistent symbolic naming and Unicode # encoding in comments. # # I run it like this: 'charmaps-check ISO_10646 ISO-IR-197' to # check if ISO-IR-197 use the same names as ISO-10646 (Unicode) use strict; use vars qw($filename $error %unicodes %codes $code_set_name $comment_char $escape_char); $error = 0; while ($filename = shift) { # *** Default values *** $escape_char = '\\'; $comment_char = '#'; open(CHARMAP, "<$filename") || warn "Unable to open $filename"; while () { if (/\s*(.*)$/) { $code_set_name = $1; next; } if (/\s*(\S+)/) { $comment_char = $1; next; } if (/\s*(\S+)/) { $escape_char = $1; next; } parse_charmap() if (/^CHARMAP/); } close CHARMAP; } exit 1 if ($error); # Convert hex, octal and decimal string values to numbers # Must handle /x0f => 15 /x0f/x0f => 3855 sub parse_encoding { my $input = shift; my $base = 1; my $value = 0; while ( $input ) { $value *= $base; if ( $input =~ m/${escape_char}x([0-9a-fA-F]{2})/ ) { my $hex = $1; my $number = oct("0x".$hex); $value += $number; $input =~ s/${escape_char}x$hex//; # Remove this part of the string $base = 16*16; } } return $value; } # Receive param with this format, and return them as array # /x0E SHIFT OUT (SO) # @retval = ('SO', 14, ' SHIFT OUT'); sub parse_charmap_line { my $line = shift; # Skip comments return if ($comment_char && /^%comment_char/); # Skip blank lines return if ($line =~ /^\s*$/); my ($code, $encoding, $comment) = $line =~ m/^<(\S+)>\s+(\S+)\s+(.+)$/; $encoding = parse_encoding($encoding); return ($code, $encoding, $comment); } sub parse_charmap { while () { if (/^END CHARMAP$/) { $code_set_name = "[unknown]"; return; } chomp; my $line = $_; my ($code, $encoding, $comment) = parse_charmap_line($line); my ($unicode, $name); print "Unable to parse line: $line\n" if ( ! $comment ); if ( $comment =~ m/\s+(.*)/ ) { ($unicode, $name) = (oct("0x".$1), $2); } else { $unicode = $encoding; $name = $comment; } if ($code) { # ************ Check UNICODE value<->name ***************** # Don't test U0000, as it is named both 'NULL' and 'NUL' if (0 != $unicode && exists $unicodes{$unicode}) { unless ($unicodes{$unicode} eq $name) { print "$filename\[$code_set_name\]: Same unicode ($unicode) but different name\n \"$name\"\n != \"$unicodes{$unicode}\"\n"; $error = 1; } } else { $unicodes{$unicode} = $name; } # *********** Check UNICODE value<->code ***************** # XXX Hm, don't seem to work right. Don't handle multiple # codes for the same name if (exists $codes{$code}) { unless ($codes{$code} eq $unicode) { print "$filename\[$code_set_name\]: Same code ($code) but different unicode \"$unicode\" != \"$codes{$code}\"\n"; $error = 1; } } else { $codes{$code} = $unicode; } } } }