Du1.t
Jump to navigation
Jump to search
Du1.t is a PHP routine that reveals the encoding of Utf8 characters. It is an upgrade of "du.t".
Code
<?php
/*
 * du1.t analyses the content of a string.
 * The string is interpreted as a sequence of UTF-8 characters; for each
 * character the script prints its Unicode number and its bytes in
 * percent-encoded hexadecimal and in decimal form.
 *
 * The helpers unichr(), uniord() and mb_str_split() are defined in this
 * file, so no other file has to be loaded.
 *
 * Usage:
 *   php du1.t "any абракадабра and だからも in any language(s)"
 */

/**
 * Encode a Unicode character number as a UTF-8 byte string.
 *
 * Numbers below 0x80 give 1 byte, below 0x800 give 2 bytes, below 0x10000
 * give 3 bytes, and everything else gives 4 bytes. No range validation is
 * done: the result is only meaningful for numbers up to 0x10FFFF.
 *
 * @param int $dec Unicode character number.
 * @return string UTF-8 encoding of the character.
 */
function unichr($dec)
{
    // Bit operations below need an integer operand.
    $dec = (int) $dec;

    if ($dec < 0x80) {
        return chr($dec);
    }
    if ($dec < 0x800) {
        return chr(0xC0 | ($dec >> 6))
            . chr(0x80 | ($dec & 0x3F));
    }
    if ($dec < 0x10000) {
        return chr(0xE0 | ($dec >> 12))
            . chr(0x80 | (($dec >> 6) & 0x3F))
            . chr(0x80 | ($dec & 0x3F));
    }
    // Characters outside the Basic Multilingual Plane need four bytes.
    return chr(0xF0 | ($dec >> 18))
        . chr(0x80 | (($dec >> 12) & 0x3F))
        . chr(0x80 | (($dec >> 6) & 0x3F))
        . chr(0x80 | ($dec & 0x3F));
}

/**
 * Recover the number of a UTF-8 character encoded with 1, 2, 3 or 4 bytes.
 *
 * The byte length of the input selects the decoding rule; the input is not
 * checked for being well-formed UTF-8. A 1-byte input returns the plain byte
 * value. For inputs longer than four bytes only the first four are used.
 *
 * @param string $a String that consists of a single UTF-8 character.
 * @return int Number of this character in the UTF-8 encoding table,
 *             see [[Utf8table]].
 * @throws InvalidArgumentException If $a is the empty string.
 */
function uniord($a)
{
    $M = strlen($a);
    if ($M === 0) {
        throw new InvalidArgumentException('uniord() expects a non-empty string');
    }

    $p = ord($a[0]);
    if ($M === 1) {
        return $p;
    }

    // Each continuation byte carries 6 payload bits (mask 0x3F); the lead
    // byte carries 5, 4 or 3 payload bits for 2-, 3- and 4-byte sequences.
    $b1 = ord($a[1]) & 0x3F;
    if ($M === 2) {
        return (($p & 0x1F) << 6) | $b1;
    }

    $b2 = ord($a[2]) & 0x3F;
    if ($M === 3) {
        return (($p & 0x0F) << 12) | ($b1 << 6) | $b2;
    }

    $b3 = ord($a[3]) & 0x3F;
    return (($p & 0x07) << 18) | ($b1 << 12) | ($b2 << 6) | $b3;
}

// PHP 7.4+ with the mbstring extension already provides mb_str_split();
// declaring it again would be a fatal "Cannot redeclare" error, so the
// fallback below is only defined when the built-in one is missing.
if (!function_exists('mb_str_split')) {
    /**
     * Split a multibyte (UTF-8) string into its characters.
     *
     * @param string $str String to split.
     * @return array List of single-character strings; empty for "".
     */
    function mb_str_split($str)
    {
        // An empty pattern with the "u" modifier matches between every pair
        // of characters; PREG_SPLIT_NO_EMPTY drops the empty edge pieces.
        $parts = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            // preg_split() fails on input that is not valid UTF-8;
            // split byte-wise so the bytes can still be shown.
            return str_split($str);
        }
        return $parts;
    }
}

/**
 * Print the byte-level and character-level analysis of a string.
 *
 * First the string and all of its bytes (hexadecimal) are printed, then the
 * list of characters is dumped with var_dump(), and finally every character
 * is reported with its Unicode number, its percent-encoded bytes and its
 * bytes in decimal representation.
 *
 * @param string $a String to analyse.
 * @return void
 */
function du1_dump($a)
{
    echo "$a\n";
    $N = strlen($a);
    echo "The array has $N bytes; here is its splitting:\n";
    for ($n = 0; $n < $N; $n++) {
        printf("%02x ", ord($a[$n]));
    }
    echo "\n";

    $b = mb_str_split($a);
    var_dump($b);

    foreach ($b as $c) {
        printf("\n");
        $u = uniord($c);
        printf("\n[[X%04X]] [[&#X%04X;]] is [[Unicode]] character number %05d\n", $u, $u, $u);
        $d = strlen($c);
        echo "[[$c]] uses $d bytes: ";
        for ($n = 0; $n < $d; $n++) {
            // Zero-padded so that bytes below 0x10 still give two hex digits.
            printf("%%%02X", ord($c[$n]));
        }
        printf("\n");
        for ($n = 0; $n < $d; $n++) {
            printf("%3d ", ord($c[$n]));
        }
        printf("in the decimal representation\n");
    }
}

if (isset($argv[1])) {
    du1_dump($argv[1]);
} else {
    // No string given on the command line: explain the usage instead of
    // reading an undefined $argv[1].
    file_put_contents('php://stderr', "Usage: php du1.t \"any text in any language(s)\"\n");
}
?>
References
Keywords
dump.t, KanjiLiberal, KanjiRadical, mb_str_split.t, PHP, Utf8, UtfH, unichr.t, Unicode, uniord.t