mirror of
https://github.com/rn10950/RetroZilla.git
synced 2024-11-09 09:20:15 +01:00
229 lines
5.5 KiB
Perl
229 lines
5.5 KiB
Perl
#!/usr/local/bin/perl
|
|
use strict;
|
|
require "genverifier.pm";
|
|
use genverifier;
|
|
|
|
|
|
my(@utf8_cls);
|
|
my(@utf8_st);
|
|
my($utf8_ver);
|
|
|
|
#
|
|
#
|
|
# UTF8 encode the UCS4 into 1 to 6 bytes
|
|
#
|
|
# 1 byte 00 00 00 00 00 00 00 7f
|
|
# 2 bytes 00 00 00 80 00 00 07 ff
|
|
# 3 bytes 00 00 08 00 00 00 ff ff
|
|
# 4 bytes 00 01 00 00 00 1f ff ff
|
|
# 5 bytes 00 20 00 00 03 ff ff ff
|
|
# 6 bytes 04 00 00 00 7f ff ff ff
|
|
#
|
|
# Howerver, since Surrogate area should not be encoded into UTF8 as
|
|
# a Surrogate pair, we can remove the surrogate area from UTF8
|
|
#
|
|
# 1 byte 00 00 00 00 00 00 00 7f
|
|
# 2 bytes 00 00 00 80 00 00 07 ff
|
|
# 3 bytes 00 00 08 00 00 00 d7 ff
|
|
# 00 00 e0 00 00 00 ff ff
|
|
# 4 bytes 00 01 00 00 00 1f ff ff
|
|
# 5 bytes 00 20 00 00 03 ff ff ff
|
|
# 6 bytes 04 00 00 00 7f ff ff ff
|
|
#
|
|
# Now we break them into 6 bits group for 2-6 bytes UTF8
|
|
#
|
|
# 1 byte 00 7f
|
|
# 2 bytes 02 00 1f 3f
|
|
# 3 bytes 00 20 00 0d 1f 3f
|
|
# 0e 00 00 0f 3f 3f
|
|
# 4 bytes 00 20 00 00 07 3f 3f 3f
|
|
# 5 bytes 00 08 00 00 00 03 3f 3f 3f 3f
|
|
# 6 bytes 00 04 00 00 00 00 01 3f 3f 3f 3f 3f
|
|
#
|
|
# Break down more
|
|
#
|
|
# 1 byte 00 7f
|
|
# 2 bytes 02 00 1f 3f
|
|
# 3 bytes 00 20 00 00 3f 3f
|
|
# 01 00 00 0c 3f 3f
|
|
# 0d 00 00 0d 1f 3f
|
|
# 0e 00 00 0f 3f 3f
|
|
# 4 bytes 00 20 00 00 00 3f 3f 3f
|
|
# 01 00 00 00 07 3f 3f 3f
|
|
# 5 bytes 00 08 00 00 00 00 3f 3f 3f 3f
|
|
# 01 00 00 00 00 03 3f 3f 3f 3f
|
|
# 6 bytes 00 04 00 00 00 00 00 3f 3f 3f 3f 3f
|
|
# 01 00 00 00 00 00 01 3f 3f 3f 3f 3f
|
|
#
|
|
# Now, add
|
|
# c0 to the lead byte of 2 bytes UTF8
|
|
# e0 to the lead byte of 3 bytes UTF8
|
|
# f0 to the lead byte of 4 bytes UTF8
|
|
# f8 to the lead byte of 5 bytes UTF8
|
|
# fc to the lead byte of 6 bytes UTF8
|
|
# 80 to the trail bytes of 2 - 6 bytes UTF8
|
|
#
|
|
# 1 byte 00 7f
|
|
# 2 bytes c2 80 df bf
|
|
# 3 bytes e0 a0 80 e0 bf bf
|
|
# e1 80 80 ec bf bf
|
|
# ed 80 80 ed 9f bf
|
|
# ee 80 80 ef bf bf
|
|
# 4 bytes f0 a0 80 80 f0 bf bf bf
|
|
# f1 80 80 80 f7 bf bf bf
|
|
# 5 bytes f8 88 80 80 80 f8 bf bf bf bf
|
|
# f9 80 80 80 80 fb bf bf bf bf
|
|
# 6 bytes fc 84 80 80 80 80 fc bf bf bf bf bf
|
|
# fd 80 80 80 80 80 fd bf bf bf bf bf
|
|
#
|
|
#
|
|
# Now we can construct our state diagram
|
|
#
|
|
# 0:0x00,0x0e,0x0f,0x1b->Error
|
|
# 0:[0-0x7f]->0
|
|
# 0:fd->3
|
|
# 0:fc->4
|
|
# 0:[f9-fb]->5
|
|
# 0:f8->6
|
|
# 0:[f1-f7]->7
|
|
# 0:f0->8
|
|
# 0:[e1-ecee-ef]->9
|
|
# 0:e0->10
|
|
# 0:ed->11
|
|
# 0:[c2-df]->12
|
|
# 0:*->Error
|
|
# 3:[80-bf]->5
|
|
# 3:*->Error
|
|
# 4:[84-bf]->5
|
|
# 4:*->Error
|
|
# 5:[80-bf]->7
|
|
# 5:*->Error
|
|
# 6:[88-bf]->7
|
|
# 6:*->Error
|
|
# 7:[80-bf]->9
|
|
# 7:*->Error
|
|
# 8:[a0-bf]->9
|
|
# 8:*->Error
|
|
# 9:[80-bf]->12
|
|
# 9:*->Error
|
|
# 10:[a0-bf]->12
|
|
# 10:*->Error
|
|
# 11:[80-9f]->12
|
|
# 11:*->Error
|
|
# 12:[80-bf]->0
|
|
# 12:*->Error
|
|
#
|
|
# Now, we classified chars into class
|
|
#
|
|
# 00,0e,0f,1b:k0
|
|
# 01-0d,10-1a,1c-7f:k1
|
|
# 80-83:k2
|
|
# 84-87:k3
|
|
# 88-9f:k4
|
|
# a0-bf:k5
|
|
# c0-c1:k0
|
|
# c2-df:k6
|
|
# e0:k7
|
|
# e1-ec:k8
|
|
# ed:k9
|
|
# ee-ef:k8
|
|
# f0:k10
|
|
# f1-f7:k11
|
|
# f8:k12
|
|
# f9-fb:k13
|
|
# fc:k14
|
|
# fd:k15
|
|
# fe-ff:k0
|
|
#
|
|
# Now, let's put them into array form
|
|
|
|
@utf8_cls = (
|
|
[ 0x00 , 0x00 , 1 ],
|
|
[ 0x0e , 0x0f , 0 ],
|
|
[ 0x1b , 0x1b , 0 ],
|
|
[ 0x01 , 0x0d , 1 ],
|
|
[ 0x10 , 0x1a , 1 ],
|
|
[ 0x1c , 0x7f , 1 ],
|
|
[ 0x80 , 0x83 , 2 ],
|
|
[ 0x84 , 0x87 , 3 ],
|
|
[ 0x88 , 0x9f , 4 ],
|
|
[ 0xa0 , 0xbf , 5 ],
|
|
[ 0xc0 , 0xc1 , 0 ],
|
|
[ 0xc2 , 0xdf , 6 ],
|
|
[ 0xe0 , 0xe0 , 7 ],
|
|
[ 0xe1 , 0xec , 8 ],
|
|
[ 0xed , 0xed , 9 ],
|
|
[ 0xee , 0xef , 8 ],
|
|
[ 0xf0 , 0xf0 , 10 ],
|
|
[ 0xf1 , 0xf7 , 11 ],
|
|
[ 0xf8 , 0xf8 , 12 ],
|
|
[ 0xf9 , 0xfb , 13 ],
|
|
[ 0xfc , 0xfc , 14 ],
|
|
[ 0xfd , 0xfd , 15 ],
|
|
[ 0xfe , 0xff , 0 ],
|
|
);
|
|
#
|
|
# Now, we write the state diagram in class
|
|
#
|
|
# 0:k0->Error
|
|
# 0:k1->0
|
|
# 0:k15->3
|
|
# 0:k14->4
|
|
# 0:k13->5
|
|
# 0:k12->6
|
|
# 0:k11->7
|
|
# 0:k10->8
|
|
# 0:k8->9
|
|
# 0:k7->10
|
|
# 0:k9->11
|
|
# 0:k6->12
|
|
# 0:*->Error
|
|
# 3:k2,k3,k4,k5->5
|
|
# 3:*->Error
|
|
# 4:k3,k4,k5->5
|
|
# 4:*->Error
|
|
# 5:k2,k3,k4,k5->7
|
|
# 5:*->Error
|
|
# 6:k4,k5->7
|
|
# 6:*->Error
|
|
# 7:k2,k3,k4,k5->9
|
|
# 7:*->Error
|
|
# 8:k5->9
|
|
# 8:*->Error
|
|
# 9:k2,k3,k4,k5->12
|
|
# 9:*->Error
|
|
# 10:k5->12
|
|
# 10:*->Error
|
|
# 11:k2,k3,k4->12
|
|
# 11:*->Error
|
|
# 12:k2,k3,k4,k5->0
|
|
# 12:*->Error
|
|
#
|
|
# Now, let's put them into array
|
|
#
|
|
package genverifier;
|
|
@utf8_st = (
|
|
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
|
1, 0, 1, 1, 1, 1,12,10, 9,11, 8, 7, 6, 5, 4, 3, # state 0 Start
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error
|
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe
|
|
1, 1, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 3
|
|
1, 1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 4
|
|
1, 1, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 5
|
|
1, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 6
|
|
1, 1, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 7
|
|
1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 8
|
|
1, 1,12,12,12,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9
|
|
1, 1, 1, 1, 1,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 10
|
|
1, 1,12,12,12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 11
|
|
1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 12
|
|
);
|
|
|
|
|
|
|
|
$utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 16, \@utf8_st);
|
|
print $utf8_ver;
|
|
|
|
|
|
|