Du1.t

From TORI
Jump to: navigation, search

Du1.t is a PHP routine that reveals the encoding of Utf8 characters. It is an upgrade of "du.t".

Code

<?php
 /**
  * Encode a Unicode code point number as its UTF-8 byte sequence.
  *
  * Produces 1 byte below 0x80, 2 bytes below 0x800, 3 bytes below 0x10000
  * and 4 bytes otherwise. The value is not validated: surrogates and
  * numbers above 0x10FFFF are encoded mechanically, not rejected.
  *
  * @param int $dec Code point number.
  * @return string The UTF-8 encoded character.
  */
 function unichr($dec) {
   if ($dec < 0x80) {
     // Plain ASCII: the byte is the code point itself.
     return chr($dec);
   }
   if ($dec < 0x800) {
     // 110xxxxx 10xxxxxx
     return chr(0xC0 | ($dec >> 6))
          . chr(0x80 | ($dec & 0x3F));
   }
   if ($dec < 0x10000) {
     // 1110xxxx 10xxxxxx 10xxxxxx
     return chr(0xE0 | ($dec >> 12))
          . chr(0x80 | (($dec >> 6) & 0x3F))
          . chr(0x80 | ($dec & 0x3F));
   }
   // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   // Without this branch the lead byte of the 3-byte form overflows and the
   // result is not valid UTF-8 for code points outside the basic plane.
   return chr(0xF0 | ($dec >> 18))
        . chr(0x80 | (($dec >> 12) & 0x3F))
        . chr(0x80 | (($dec >> 6) & 0x3F))
        . chr(0x80 | ($dec & 0x3F));
 }
// include "unichr.t";

 /**
  * Recover the number of a UTF-8 character encoded with 1, 2, 3 or 4 bytes.
  *
  * Input: a string that consists of a single UTF-8 character.
  * Output: the number of this character in the UTF-8 encoding table,
  * see [[Utf8table]].
  *
  * The decoding form is chosen from the byte length of the string; the
  * bytes themselves are not checked for being a well-formed sequence.
  *
  * @param string $a Exactly one UTF-8 encoded character.
  * @return int The code point number.
  * @throws InvalidArgumentException If $a is empty or longer than 4 bytes.
  */
 function uniord($a)
 {
   $M = strlen($a);
   if ($M < 1 || $M > 4) {
     // No UTF-8 character has this length; reading $a[0] of an empty
     // string or ignoring trailing bytes would only yield a bogus number.
     throw new InvalidArgumentException(
       "uniord() expects exactly one UTF-8 character (1 to 4 bytes), got $M bytes"
     );
   }

   $lead = ord($a[0]);
   if ($M == 1) {
     return $lead;
   }

   // Every following byte carries 6 payload bits in its low part.
   $tail = 0;
   for ($i = 1; $i < $M; $i++) {
     $tail = ($tail << 6) | (ord($a[$i]) & 0x3F);
   }

   // The lead byte carries 5, 4 or 3 payload bits for 2, 3 or 4 byte forms.
   if ($M == 2) {
     return (($lead & 0x1F) << 6) | $tail;
   }
   if ($M == 3) {
     return (($lead & 0x0F) << 12) | $tail;
   }
   return (($lead & 0x07) << 18) | $tail;
 }
//include "uniord.t";

// PHP 7.4+ with the mbstring extension already provides mb_str_split();
// declaring it again is a fatal "Cannot redeclare" error, so this fallback
// is only defined when the built-in is missing.
if (!function_exists('mb_str_split')) {
  /**
   * Split a UTF-8 string into an array of single characters.
   *
   * The empty pattern matches at every character boundary, and
   * PREG_SPLIT_NO_EMPTY drops the empty pieces at both ends. Unlike a
   * lookahead on "$", this also separates a trailing newline from the
   * last character. An empty input gives an empty array.
   *
   * @param string $str UTF-8 encoded text.
   * @return array|false List of characters; preg_split() yields false
   *                     when $str is not valid UTF-8.
   */
  function mb_str_split($str) {
    return preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
  }
}
//include "mb_str_split.t";

//dump.t analyses the content of a string.
//The string is interpreted as a sequence of Utf8 characters
// files unichr.t, uniord.t, mb_str_split.t
// should be loaded in the working directory.  Usage:
// php dump.t "any абракадабра and だからも in any language(s)"

// Command-line driver: print the raw bytes of the first argument, then for
// each UTF-8 character its number and its bytes in hexadecimal and decimal.
if (!isset($argv[1])) {
  // Nothing to analyse; explain the usage instead of dumping an undefined value.
  $self = isset($argv[0]) ? $argv[0] : 'du1.t';
  file_put_contents('php://stderr', "Usage: php $self \"any text in any language(s)\"\n");
  exit(1);
}

$a=$argv[1];
echo "$a\n";
$N=strlen($a);
echo "The array has $N bytes; here is its splitting:\n";

// Hexadecimal dump of every byte of the argument.
for($n=0;$n<$N;$n++){printf("%02x ",ord($a[$n]) );}
echo "\n";

$b = mb_str_split($a);
if (!is_array($b)) {
  // A preg-based split reports failure (false) for input that is not valid UTF-8.
  file_put_contents('php://stderr', "The argument is not valid UTF-8 and cannot be split into characters.\n");
  exit(1);
}
var_dump($b);

foreach ($b as $c)
{
  // An empty piece (possible for an empty argument) holds no character to decode.
  if ($c === '') continue;

  printf("\n");
  $u=uniord($c);
  printf("\n[[X%04X]] [[&#X%04X;]] is [[Unicode]] character number %05d\n",$u,$u,$u);
  $d=strlen($c);
  echo "[[$c]]  uses $d bytes: ";
  // Percent-encoded form; %02X keeps two hex digits for bytes below 0x10
  // (a space-padded "% A" is not a valid escape).
  for($n=0;$n<$d;$n++) printf("%%%02X",ord($c[$n]));
  printf("\n");
  for($n=0;$n<$d;$n++) printf("%3d ",ord($c[$n]));
  printf("in the decimal representation\n");
}
?>

References


Keywords

dump.t, KanjiLiberal, KanjiRadical, mb_str_split.t, PHP, Utf8, UtfH, unichr.t, Unicode, uniord.t