Du1.t
Jump to navigation
Jump to search
Du1.t is a PHP routine that reveals the encoding of Utf8 characters. It is an upgrade of "du.t".
Code
<?php
/**
 * Encode a Unicode code point number as a UTF-8 byte string.
 *
 * Code points below 0x80 use 1 byte, below 0x800 use 2 bytes, below
 * 0x10000 use 3 bytes, and everything above uses 4 bytes (needed for
 * characters outside the Basic Multilingual Plane, e.g. emoji).
 *
 * @param int $dec Code point number; meaningful range is 0 .. 0x10FFFF.
 * @return string The UTF-8 byte sequence that encodes $dec.
 */
function unichr($dec) {
    if ($dec < 0x80) {
        // 0xxxxxxx
        return chr($dec);
    }
    if ($dec < 0x800) {
        // 110xxxxx 10xxxxxx
        return chr(0xC0 | ($dec >> 6))
            . chr(0x80 | ($dec & 0x3F));
    }
    if ($dec < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xE0 | ($dec >> 12))
            . chr(0x80 | (($dec >> 6) & 0x3F))
            . chr(0x80 | ($dec & 0x3F));
    }
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(0xF0 | ($dec >> 18))
        . chr(0x80 | (($dec >> 12) & 0x3F))
        . chr(0x80 | (($dec >> 6) & 0x3F))
        . chr(0x80 | ($dec & 0x3F));
}
// include "unichr.t";
/**
 * Recovery of the number of a Utf8 character encoded with 1, 2, 3 or 4 bytes.
 *
 * Input: a string that consists of a single utf8 character.
 * Output: the number of this character in the utf8 encoding table,
 * see [[Utf8table]].
 *
 * The number of bytes in the string selects the decoding formula; bytes
 * beyond the fourth are ignored. An empty string yields 0.
 *
 * @param string $a One UTF-8 encoded character.
 * @return int The code point number of that character.
 */
function uniord($a)
{
    $M = strlen($a);
    if ($M === 0) {
        // Nothing to decode; avoid reading uninitialized string offsets.
        return 0;
    }

    $lead = ord($a[0]);
    if ($M === 1) {
        return $lead;
    }

    // Each continuation byte carries 6 payload bits (10xxxxxx).
    $c1 = ord($a[1]) & 0x3F;
    if ($M === 2) {
        // 110xxxxx 10xxxxxx
        return (($lead & 0x1F) << 6) | $c1;
    }

    $c2 = ord($a[2]) & 0x3F;
    if ($M === 3) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return (($lead & 0x0F) << 12) | ($c1 << 6) | $c2;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    $c3 = ord($a[3]) & 0x3F;
    return (($lead & 0x07) << 18) | ($c1 << 12) | ($c2 << 6) | $c3;
}
//include "uniord.t";
// PHP 7.4+ with the mbstring extension already provides mb_str_split();
// declaring it again would be a fatal "Cannot redeclare" error, so this
// fallback is only defined when the native function is missing.
if (!function_exists('mb_str_split')) {
    /**
     * Split a UTF-8 string into an array of its characters.
     *
     * @param string $str UTF-8 encoded text.
     * @return array|false List of single-character strings (empty for an
     *                     empty input), or false when $str is not valid
     *                     UTF-8 and the /u pattern cannot be applied.
     */
    function mb_str_split($str) {
        // An empty /u pattern matches at every character boundary;
        // PREG_SPLIT_NO_EMPTY drops the empty pieces at both ends. Unlike a
        // "(?!$)" look-ahead, this also separates a trailing newline from
        // the character before it.
        return preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    }
}
//include "mb_str_split.t";
//dump.t analyses the content of a string.
//The string is interpreted as a sequence of Utf8 characters
// files unichr.t, uniord.t, mb_str_split.t
// should be loaded in the working directory. Usage:
// php dump.t "any абракадабра and だからも in any language(s)"
// Driver: dump the bytes of the first command-line argument, then split it
// into UTF-8 characters and report the code point number and byte encoding
// of each character.
if (!isset($argv[1])) {
    // Without this guard a missing argument is used as an undefined value.
    $script = isset($argv[0]) ? $argv[0] : 'dump.t';
    fwrite(STDERR, "Usage: php $script \"text to analyse\"\n");
    exit(1);
}

$a = $argv[1];
echo "$a\n";

// Raw byte dump of the whole argument, in hexadecimal.
$N = strlen($a);
echo "The array has $N bytes; here is its splitting:\n";
for ($n = 0; $n < $N; $n++) {
    printf("%02x ", ord($a[$n]));
}
echo "\n";

$b = mb_str_split($a);
if (!is_array($b)) {
    // A preg_split-based splitter returns false for invalid UTF-8 input.
    fwrite(STDERR, "The argument is not valid UTF-8 and cannot be split into characters.\n");
    exit(1);
}
var_dump($b);

foreach ($b as $c) {
    if ($c === '') {
        // Some splitters return a single empty piece for an empty input;
        // there is no character to describe in that case.
        continue;
    }
    printf("\n");
    $u = uniord($c);
    printf("\n[[X%04X]] [[&#X%04X;]] is [[Unicode]] character number %05d\n", $u, $u, $u);
    $d = strlen($c);
    echo "[[$c]] uses $d bytes: ";
    // Percent-encoded form; two zero-padded hex digits per byte so that
    // bytes below 0x10 do not come out as "% 9".
    for ($n = 0; $n < $d; $n++) {
        printf("%s%02X", '%', ord($c[$n]));
    }
    printf("\n");
    for ($n = 0; $n < $d; $n++) {
        printf("%3d ", ord($c[$n]));
    }
    printf("in the decimal representation\n");
}
?>
References
Keywords
dump.t, KanjiLiberal, KanjiRadical, mb_str_split.t, PHP, Utf8, UtfH, unichr.t, Unicode, uniord.t