| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
=head1 NAME |
|---|
| 6 |
|
|---|
| 7 |
Jcode - Japanese Charset Handler |
|---|
| 8 |
|
|---|
| 9 |
=head1 SYNOPSIS |
|---|
| 10 |
|
|---|
| 11 |
use Jcode; |
|---|
| 12 |
|
|---|
| 13 |
|
|---|
| 14 |
Jcode::convert(\$str, $ocode, $icode, "z"); |
|---|
| 15 |
|
|---|
| 16 |
print Jcode->new($str)->h2z->tr($from, $to)->utf8; |
|---|
| 17 |
|
|---|
| 18 |
=cut |
|---|
| 19 |
|
|---|
| 20 |
=head1 DESCRIPTION |
|---|
| 21 |
|
|---|
| 22 |
Jcode.pm supports both object and traditional approach. |
|---|
| 23 |
With object approach, you can go like; |
|---|
| 24 |
|
|---|
| 25 |
$iso_2022_jp = Jcode->new($str)->h2z->jis; |
|---|
| 26 |
|
|---|
| 27 |
Which is more elegant than; |
|---|
| 28 |
|
|---|
| 29 |
$iso_2022_jp = &jcode::convert(\$str,'jis',jcode::getcode(\str), "z"); |
|---|
| 30 |
|
|---|
| 31 |
For those unfamiliar with objects, Jcode.pm still supports getcode() |
|---|
| 32 |
and convert(). |
|---|
| 33 |
|
|---|
| 34 |
=cut |
|---|
| 35 |
|
|---|
| 36 |
package Jcode; |
|---|
| 37 |
use 5.004; |
|---|
| 38 |
use Carp; |
|---|
| 39 |
use strict; |
|---|
| 40 |
use vars qw($RCSID $VERSION $DEBUG); |
|---|
| 41 |
|
|---|
| 42 |
$RCSID = q$Id: Jcode.pm,v 0.83 2003/03/16 16:15:34 dankogai Exp dankogai $; |
|---|
| 43 |
$VERSION = do { my @r = (q$Revision: 0.83 $ =~ /\d+/g); sprintf "%d."."%02d" x $ |
|---|
| 44 |
$DEBUG = 0; |
|---|
| 45 |
|
|---|
| 46 |
use Exporter; |
|---|
| 47 |
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); |
|---|
| 48 |
@ISA = qw(Exporter); |
|---|
| 49 |
@EXPORT = qw(jcode getcode); |
|---|
| 50 |
@EXPORT_OK = qw($RCSID $VERSION $DEBUG); |
|---|
| 51 |
%EXPORT_TAGS = ( all => [ @EXPORT_OK, @EXPORT ] ); |
|---|
| 52 |
|
|---|
| 53 |
|
|---|
| 54 |
use vars qw($USE_CACHE $NOXS); |
|---|
| 55 |
|
|---|
| 56 |
$USE_CACHE = 1; |
|---|
| 57 |
$NOXS = 0; |
|---|
| 58 |
|
|---|
| 59 |
print $RCSID, "\n" if $DEBUG; |
|---|
| 60 |
|
|---|
| 61 |
use Jcode::Constants qw(:all); |
|---|
| 62 |
|
|---|
| 63 |
use overload |
|---|
| 64 |
q("") => sub { ${$_[0]->[0]} }, |
|---|
| 65 |
q(==) => sub {overload::StrVal($_[0]) eq overload::StrVal($_[1])}, |
|---|
| 66 |
q(=) => sub { $_[0]->set( $_[1] ) }, |
|---|
| 67 |
q(.=) => sub { $_[0]->append( $_[1] ) }, |
|---|
| 68 |
fallback => 1, |
|---|
| 69 |
; |
|---|
| 70 |
|
|---|
| 71 |
=head1 Methods |
|---|
| 72 |
|
|---|
| 73 |
Methods mentioned here all return Jcode object unless otherwise mentioned. |
|---|
| 74 |
|
|---|
| 75 |
=over 4 |
|---|
| 76 |
|
|---|
| 77 |
=item $j = Jcode-E<gt>new($str [, $icode]); |
|---|
| 78 |
|
|---|
| 79 |
Creates Jcode object $j from $str. Input code is automatically checked |
|---|
| 80 |
unless you explicitly set $icode. For available charset, see L<getcode> |
|---|
| 81 |
below. |
|---|
| 82 |
|
|---|
| 83 |
The object keeps the string in EUC format enternaly. When the object |
|---|
| 84 |
itself is evaluated, it returns the EUC-converted string so you can |
|---|
| 85 |
"print $j;" without calling access method if you are using EUC |
|---|
| 86 |
(thanks to function overload). |
|---|
| 87 |
|
|---|
| 88 |
=item Passing Reference |
|---|
| 89 |
|
|---|
| 90 |
Instead of scalar value, You can use reference as |
|---|
| 91 |
|
|---|
| 92 |
Jcode->new(\$str); |
|---|
| 93 |
|
|---|
| 94 |
This saves time a little bit. In exchange of the value of $str being |
|---|
| 95 |
converted. (In a way, $str is now "tied" to jcode object). |
|---|
| 96 |
|
|---|
| 97 |
=item $j-E<gt>set($str [, $icode]); |
|---|
| 98 |
|
|---|
| 99 |
Sets $j's internal string to $str. Handy when you use Jcode object repeatedly |
|---|
| 100 |
(saves time and memory to create object). |
|---|
| 101 |
|
|---|
| 102 |
# converts mailbox to SJIS format |
|---|
| 103 |
my $jconv = new Jcode; |
|---|
| 104 |
$/ = 00; |
|---|
| 105 |
while(<>){ |
|---|
| 106 |
print $jconv->set(\$_)->mime_decode->sjis; |
|---|
| 107 |
} |
|---|
| 108 |
|
|---|
| 109 |
=item $j-E<gt>append($str [, $icode]); |
|---|
| 110 |
|
|---|
| 111 |
Appends $str to $j's internal string. |
|---|
| 112 |
|
|---|
| 113 |
=back |
|---|
| 114 |
|
|---|
| 115 |
=cut |
|---|
| 116 |
|
|---|
| 117 |
sub new { |
|---|
| 118 |
my $class = shift; |
|---|
| 119 |
my ($thingy, $icode) = @_; |
|---|
| 120 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 121 |
my $nmatch; |
|---|
| 122 |
($icode, $nmatch) = getcode($r_str) unless $icode; |
|---|
| 123 |
convert($r_str, 'euc', $icode); |
|---|
| 124 |
my $self = [ |
|---|
| 125 |
$r_str, |
|---|
| 126 |
$icode, |
|---|
| 127 |
$nmatch, |
|---|
| 128 |
]; |
|---|
| 129 |
carp "Object of class $class created" if $DEBUG >= 2; |
|---|
| 130 |
bless $self, $class; |
|---|
| 131 |
} |
|---|
| 132 |
|
|---|
| 133 |
sub r_str { $_[0]->[0] } |
|---|
| 134 |
sub icode { $_[0]->[1] } |
|---|
| 135 |
sub nmatch { $_[0]->[2] } |
|---|
| 136 |
|
|---|
| 137 |
sub set { |
|---|
| 138 |
my $self = shift; |
|---|
| 139 |
my ($thingy, $icode) = @_; |
|---|
| 140 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 141 |
my $nmatch; |
|---|
| 142 |
($icode, $nmatch) = getcode($r_str) unless $icode; |
|---|
| 143 |
convert($r_str, 'euc', $icode); |
|---|
| 144 |
$self->[0] = $r_str; |
|---|
| 145 |
$self->[1] = $icode; |
|---|
| 146 |
$self->[2] = $nmatch; |
|---|
| 147 |
return $self; |
|---|
| 148 |
} |
|---|
| 149 |
|
|---|
| 150 |
sub append { |
|---|
| 151 |
my $self = shift; |
|---|
| 152 |
my ($thingy, $icode) = @_; |
|---|
| 153 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 154 |
my $nmatch; |
|---|
| 155 |
($icode, $nmatch) = getcode($r_str) unless $icode; |
|---|
| 156 |
convert($r_str, 'euc', $icode); |
|---|
| 157 |
${$self->[0]} .= $$r_str; |
|---|
| 158 |
$self->[1] = $icode; |
|---|
| 159 |
$self->[2] = $nmatch; |
|---|
| 160 |
return $self; |
|---|
| 161 |
} |
|---|
| 162 |
|
|---|
| 163 |
=over 4 |
|---|
| 164 |
|
|---|
| 165 |
=item $j = jcode($str [, $icode]); |
|---|
| 166 |
|
|---|
| 167 |
shortcut for Jcode->new() so you can go like; |
|---|
| 168 |
|
|---|
| 169 |
$sjis = jcode($str)->sjis; |
|---|
| 170 |
|
|---|
| 171 |
=item $euc = $j-E<gt>euc; |
|---|
| 172 |
|
|---|
| 173 |
=item $jis = $j-E<gt>jis; |
|---|
| 174 |
|
|---|
| 175 |
=item $sjis = $j-E<gt>sjis; |
|---|
| 176 |
|
|---|
| 177 |
What you code is what you get :) |
|---|
| 178 |
|
|---|
| 179 |
=item $iso_2022_jp = $j-E<gt>iso_2022_jp |
|---|
| 180 |
|
|---|
| 181 |
Same as $j->z2h->jis. |
|---|
| 182 |
Hankaku Kanas are forcibly converted to Zenkaku. |
|---|
| 183 |
|
|---|
| 184 |
=back |
|---|
| 185 |
|
|---|
| 186 |
=cut |
|---|
| 187 |
|
|---|
| 188 |
sub jcode { return Jcode->new(@_) } |
|---|
| 189 |
sub euc { return ${$_[0]->[0]} } |
|---|
| 190 |
sub jis { return &euc_jis(${$_[0]->[0]})} |
|---|
| 191 |
sub sjis { return &euc_sjis(${$_[0]->[0]})} |
|---|
| 192 |
sub iso_2022_jp{return $_[0]->h2z->jis} |
|---|
| 193 |
|
|---|
| 194 |
=over 4 |
|---|
| 195 |
|
|---|
| 196 |
=item [@lines =] $jcode-E<gt>jfold([$bytes_per_line, $newline_str]); |
|---|
| 197 |
|
|---|
| 198 |
folds lines in jcode string every $bytes_per_line (default: 72) |
|---|
| 199 |
in a way that does not clobber the multibyte string. |
|---|
| 200 |
(Sorry, no Kinsoku done!) |
|---|
| 201 |
with a newline string spified by $newline_str (default: \n). |
|---|
| 202 |
|
|---|
| 203 |
=back |
|---|
| 204 |
|
|---|
| 205 |
=cut |
|---|
| 206 |
|
|---|
| 207 |
sub jfold{ |
|---|
| 208 |
my $self = shift; |
|---|
| 209 |
my ($bpl, $nl) = @_; |
|---|
| 210 |
$bpl ||= 72; |
|---|
| 211 |
$nl ||= "\n"; |
|---|
| 212 |
my $r_str = $self->[0]; |
|---|
| 213 |
my (@lines, $len, $i); |
|---|
| 214 |
while ($$r_str =~ |
|---|
| 215 |
m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo) |
|---|
| 216 |
{ |
|---|
| 217 |
if ($len + length($1) > $bpl){ |
|---|
| 218 |
$i++; |
|---|
| 219 |
$len = 0; |
|---|
| 220 |
} |
|---|
| 221 |
$lines[$i] .= $1; |
|---|
| 222 |
$len += length($1); |
|---|
| 223 |
} |
|---|
| 224 |
defined($lines[$i]) or pop @lines; |
|---|
| 225 |
$$r_str = join($nl, @lines); |
|---|
| 226 |
return wantarray ? @lines : $self; |
|---|
| 227 |
} |
|---|
| 228 |
|
|---|
| 229 |
=pod |
|---|
| 230 |
|
|---|
| 231 |
=over 4 |
|---|
| 232 |
|
|---|
| 233 |
=item $length = $jcode-E<gt>jlength(); |
|---|
| 234 |
|
|---|
| 235 |
returns character length properly, rather than byte length. |
|---|
| 236 |
|
|---|
| 237 |
=back |
|---|
| 238 |
|
|---|
| 239 |
=cut |
|---|
| 240 |
|
|---|
| 241 |
sub jlength { |
|---|
| 242 |
my $self = shift; |
|---|
| 243 |
my $r_str = $self->[0]; |
|---|
| 244 |
return scalar (my @char = $$r_str =~ m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo); |
|---|
| 245 |
} |
|---|
| 246 |
|
|---|
| 247 |
=head2 Methods that use MIME::Base64 |
|---|
| 248 |
|
|---|
| 249 |
To use methods below, you need MIME::Base64. To install, simply |
|---|
| 250 |
|
|---|
| 251 |
perl -MCPAN -e 'CPAN::Shell->install("MIME::Base64")' |
|---|
| 252 |
|
|---|
| 253 |
=over 4 |
|---|
| 254 |
|
|---|
| 255 |
=item $mime_header = $j-E<gt>mime_encode([$lf, $bpl]); |
|---|
| 256 |
|
|---|
| 257 |
Converts $str to MIME-Header documented in RFC1522. |
|---|
| 258 |
When $lf is specified, it uses $lf to fold line (default: \n). |
|---|
| 259 |
When $bpl is specified, it uses $bpl for the number of bytes (default: 76; |
|---|
| 260 |
this number must be smaller than 76). |
|---|
| 261 |
|
|---|
| 262 |
=item $j-E<gt>mime_decode; |
|---|
| 263 |
|
|---|
| 264 |
Decodes MIME-Header in Jcode object. |
|---|
| 265 |
|
|---|
| 266 |
You can retrieve the number of matches via $j->nmatch; |
|---|
| 267 |
|
|---|
| 268 |
=back |
|---|
| 269 |
|
|---|
| 270 |
=cut |
|---|
| 271 |
|
|---|
| 272 |
sub mime_encode{ |
|---|
| 273 |
my $self = shift; |
|---|
| 274 |
my $r_str = $self->[0]; |
|---|
| 275 |
my $lf = shift || "\n"; |
|---|
| 276 |
my $bpl = shift || 76; |
|---|
| 277 |
|
|---|
| 278 |
my ($trailing_crlf) = ($$r_str =~ /(\n|\r|\x0d\x0a)$/o); |
|---|
| 279 |
my $str = _mime_unstructured_header($$r_str, $lf, $bpl); |
|---|
| 280 |
not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o; |
|---|
| 281 |
$str; |
|---|
| 282 |
} |
|---|
| 283 |
|
|---|
| 284 |
|
|---|
| 285 |
|
|---|
| 286 |
|
|---|
| 287 |
|
|---|
| 288 |
|
|---|
| 289 |
sub _add_encoded_word { |
|---|
| 290 |
require MIME::Base64; |
|---|
| 291 |
my($str, $line, $bpl) = @_; |
|---|
| 292 |
my $result = ''; |
|---|
| 293 |
while (length($str)) { |
|---|
| 294 |
my $target = $str; |
|---|
| 295 |
$str = ''; |
|---|
| 296 |
if (length($line) + 22 + |
|---|
| 297 |
($target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o) * 8 > $bpl) { |
|---|
| 298 |
$line =~ s/[ \t\n\r]*$/\n/; |
|---|
| 299 |
$result .= $line; |
|---|
| 300 |
$line = ' '; |
|---|
| 301 |
} |
|---|
| 302 |
while (1) { |
|---|
| 303 |
my $iso_2022_jp = jcode($target, 'euc')->iso_2022_jp; |
|---|
| 304 |
if (my $count = ($iso_2022_jp =~ tr/\x80-\xff//d)){ |
|---|
| 305 |
$DEBUG and warn $count; |
|---|
| 306 |
$target = jcode($iso_2022_jp, 'iso_2022_jp')->euc; |
|---|
| 307 |
} |
|---|
| 308 |
my $encoded = '=?ISO-2022-JP?B?' . |
|---|
| 309 |
MIME::Base64::encode_base64($iso_2022_jp, '') |
|---|
| 310 |
. '?='; |
|---|
| 311 |
if (length($encoded) + length($line) > $bpl) { |
|---|
| 312 |
$target =~ |
|---|
| 313 |
s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o; |
|---|
| 314 |
$str = $1 . $str; |
|---|
| 315 |
} else { |
|---|
| 316 |
$line .= $encoded; |
|---|
| 317 |
last; |
|---|
| 318 |
} |
|---|
| 319 |
} |
|---|
| 320 |
} |
|---|
| 321 |
return $result . $line; |
|---|
| 322 |
} |
|---|
| 323 |
|
|---|
| 324 |
sub _mime_unstructured_header { |
|---|
| 325 |
my ($oldheader, $lf, $bpl) = @_; |
|---|
| 326 |
my(@words, @wordstmp, $i); |
|---|
| 327 |
my $header = ''; |
|---|
| 328 |
$oldheader =~ s/\s+$//; |
|---|
| 329 |
@wordstmp = split /\s+/, $oldheader; |
|---|
| 330 |
for ($i = 0; $i < $ |
|---|
| 331 |
if ($wordstmp[$i] !~ /^[\x21-\x7E]+$/ and |
|---|
| 332 |
$wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/) { |
|---|
| 333 |
$wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]"; |
|---|
| 334 |
} else { |
|---|
| 335 |
push(@words, $wordstmp[$i]); |
|---|
| 336 |
} |
|---|
| 337 |
} |
|---|
| 338 |
push(@words, $wordstmp[-1]); |
|---|
| 339 |
for my $word (@words) { |
|---|
| 340 |
if ($word =~ /^[\x21-\x7E]+$/) { |
|---|
| 341 |
$header =~ /(?:.*\n)*(.*)/; |
|---|
| 342 |
if (length($1) + length($word) > $bpl) { |
|---|
| 343 |
$header .= "$lf $word"; |
|---|
| 344 |
} else { |
|---|
| 345 |
$header .= $word; |
|---|
| 346 |
} |
|---|
| 347 |
} else { |
|---|
| 348 |
$header = _add_encoded_word($word, $header, $bpl); |
|---|
| 349 |
} |
|---|
| 350 |
$header =~ /(?:.*\n)*(.*)/; |
|---|
| 351 |
if (length($1) == $bpl) { |
|---|
| 352 |
$header .= "$lf "; |
|---|
| 353 |
} else { |
|---|
| 354 |
$header .= ' '; |
|---|
| 355 |
} |
|---|
| 356 |
} |
|---|
| 357 |
$header =~ s/\n? $/\n/; |
|---|
| 358 |
$header; |
|---|
| 359 |
} |
|---|
| 360 |
|
|---|
| 361 |
|
|---|
| 362 |
|
|---|
| 363 |
|
|---|
| 364 |
|
|---|
| 365 |
|
|---|
| 366 |
|
|---|
| 367 |
sub mime_decode{ |
|---|
| 368 |
require MIME::Base64; |
|---|
| 369 |
my $self = shift; |
|---|
| 370 |
my $r_str = $self->[0]; |
|---|
| 371 |
my $re_lws = '(?:(?:\r|\n|\x0d\x0a)?[ \t])+'; |
|---|
| 372 |
my $re_ew = '=\?[Ii][Ss][Oo]-2022-[Jj][Pp]\?[Bb]\?([A-Za-z0-9+/]+=*)\?='; |
|---|
| 373 |
$$r_str =~ s/($re_ew)$re_lws(?=$re_ew)/$1/sgo; |
|---|
| 374 |
$$r_str =~ s/$re_lws/ /go; |
|---|
| 375 |
$self->[2] = |
|---|
| 376 |
($$r_str =~ |
|---|
| 377 |
s/$re_ew/jis_euc(MIME::Base64::decode_base64($1))/ego |
|---|
| 378 |
); |
|---|
| 379 |
$self; |
|---|
| 380 |
} |
|---|
| 381 |
|
|---|
| 382 |
|
|---|
| 383 |
=head2 Methods implemented by Jcode::H2Z |
|---|
| 384 |
|
|---|
| 385 |
Methods below are actually implemented in Jcode::H2Z. |
|---|
| 386 |
|
|---|
| 387 |
=over 4 |
|---|
| 388 |
|
|---|
| 389 |
=item $j-E<gt>h2z([$keep_dakuten]); |
|---|
| 390 |
|
|---|
| 391 |
Converts X201 kana (Hankaku) to X208 kana (Zenkaku). |
|---|
| 392 |
When $keep_dakuten is set, it leaves dakuten as is |
|---|
| 393 |
(That is, "ka + dakuten" is left as is instead of |
|---|
| 394 |
being converted to "ga") |
|---|
| 395 |
|
|---|
| 396 |
You can retrieve the number of matches via $j->nmatch; |
|---|
| 397 |
|
|---|
| 398 |
=item $j-E<gt>z2h; |
|---|
| 399 |
|
|---|
| 400 |
Converts X208 kana (Zenkaku) to X201 kana (Hankaku). |
|---|
| 401 |
|
|---|
| 402 |
You can retrieve the number of matches via $j->nmatch; |
|---|
| 403 |
|
|---|
| 404 |
=back |
|---|
| 405 |
|
|---|
| 406 |
=cut |
|---|
| 407 |
|
|---|
| 408 |
sub h2z { |
|---|
| 409 |
require Jcode::H2Z; |
|---|
| 410 |
my $self = shift; |
|---|
| 411 |
$self->[2] = Jcode::H2Z::h2z($self->[0], @_); |
|---|
| 412 |
return $self; |
|---|
| 413 |
} |
|---|
| 414 |
|
|---|
| 415 |
|
|---|
| 416 |
sub z2h { |
|---|
| 417 |
require Jcode::H2Z; |
|---|
| 418 |
my $self = shift; |
|---|
| 419 |
$self->[2] = &Jcode::H2Z::z2h($self->[0], @_); |
|---|
| 420 |
return $self; |
|---|
| 421 |
} |
|---|
| 422 |
|
|---|
| 423 |
|
|---|
| 424 |
=head2 Methods implemented in Jcode::Tr |
|---|
| 425 |
|
|---|
| 426 |
Methods here are actually implemented in Jcode::Tr. |
|---|
| 427 |
|
|---|
| 428 |
=over 4 |
|---|
| 429 |
|
|---|
| 430 |
=item $j-E<gt>tr($from, $to); |
|---|
| 431 |
|
|---|
| 432 |
Applies tr on Jcode object. $from and $to can contain EUC Japanese. |
|---|
| 433 |
|
|---|
| 434 |
You can retrieve the number of matches via $j->nmatch; |
|---|
| 435 |
|
|---|
| 436 |
=back |
|---|
| 437 |
|
|---|
| 438 |
=cut |
|---|
| 439 |
|
|---|
| 440 |
sub tr{ |
|---|
| 441 |
require Jcode::Tr; |
|---|
| 442 |
my $self = shift; |
|---|
| 443 |
$self->[2] = Jcode::Tr::tr($self->[0], @_); |
|---|
| 444 |
return $self; |
|---|
| 445 |
} |
|---|
| 446 |
|
|---|
| 447 |
|
|---|
| 448 |
|
|---|
| 449 |
|
|---|
| 450 |
|
|---|
| 451 |
use vars qw(%PKG_LOADED); |
|---|
| 452 |
sub load_module{ |
|---|
| 453 |
my $pkg = shift; |
|---|
| 454 |
return $pkg if $PKG_LOADED{$pkg}++; |
|---|
| 455 |
unless ($NOXS){ |
|---|
| 456 |
eval qq( require $pkg; ); |
|---|
| 457 |
unless ($@){ |
|---|
| 458 |
carp "$pkg loaded." if $DEBUG; |
|---|
| 459 |
return $pkg; |
|---|
| 460 |
} |
|---|
| 461 |
} |
|---|
| 462 |
$pkg .= "::NoXS"; |
|---|
| 463 |
eval qq( require $pkg; ); |
|---|
| 464 |
unless ($@){ |
|---|
| 465 |
carp "$pkg loaded" if $DEBUG; |
|---|
| 466 |
}else{ |
|---|
| 467 |
croak "Loading $pkg failed!"; |
|---|
| 468 |
} |
|---|
| 469 |
$pkg; |
|---|
| 470 |
} |
|---|
| 471 |
|
|---|
| 472 |
=head2 Methods implemented in Jcode::Unicode |
|---|
| 473 |
|
|---|
| 474 |
If your perl does not support XS (or you can't C<perl Makefile.PL>, |
|---|
| 475 |
Jcode::Unicode::NoXS will be used. |
|---|
| 476 |
|
|---|
| 477 |
See L<Jcode::Unicode> and L<Jcode::Unicode::NoXS> for details |
|---|
| 478 |
|
|---|
| 479 |
=over 4 |
|---|
| 480 |
|
|---|
| 481 |
=item $ucs2 = $j-E<gt>ucs2; |
|---|
| 482 |
|
|---|
| 483 |
Returns UCS2 (Raw Unicode) string. |
|---|
| 484 |
|
|---|
| 485 |
=item $ucs2 = $j-E<gt>utf8; |
|---|
| 486 |
|
|---|
| 487 |
Returns utf8 String. |
|---|
| 488 |
|
|---|
| 489 |
=back |
|---|
| 490 |
|
|---|
| 491 |
=cut |
|---|
| 492 |
|
|---|
| 493 |
sub ucs2{ |
|---|
| 494 |
load_module("Jcode::Unicode"); |
|---|
| 495 |
euc_ucs2(${$_[0]->[0]}); |
|---|
| 496 |
} |
|---|
| 497 |
|
|---|
| 498 |
sub utf8{ |
|---|
| 499 |
load_module("Jcode::Unicode"); |
|---|
| 500 |
euc_utf8(${$_[0]->[0]}); |
|---|
| 501 |
} |
|---|
| 502 |
|
|---|
| 503 |
=head2 Instance Variables |
|---|
| 504 |
|
|---|
| 505 |
If you need to access instance variables of Jcode object, use access |
|---|
| 506 |
methods below instead of directly accessing them (That's what OOP |
|---|
| 507 |
is all about) |
|---|
| 508 |
|
|---|
| 509 |
FYI, Jcode uses a ref to array instead of ref to hash (common way) to |
|---|
| 510 |
optimize speed (Actually you don't have to know as long as you use |
|---|
| 511 |
access methods instead; Once again, that's OOP) |
|---|
| 512 |
|
|---|
| 513 |
=over 4 |
|---|
| 514 |
|
|---|
| 515 |
=item $j-E<gt>r_str |
|---|
| 516 |
|
|---|
| 517 |
Reference to the EUC-coded String. |
|---|
| 518 |
|
|---|
| 519 |
=item $j-E<gt>icode |
|---|
| 520 |
|
|---|
| 521 |
Input charcode in recent operation. |
|---|
| 522 |
|
|---|
| 523 |
=item $j-E<gt>nmatch |
|---|
| 524 |
|
|---|
| 525 |
Number of matches (Used in $j->tr, etc.) |
|---|
| 526 |
|
|---|
| 527 |
=back |
|---|
| 528 |
|
|---|
| 529 |
=cut |
|---|
| 530 |
|
|---|
| 531 |
=head1 Subroutines |
|---|
| 532 |
|
|---|
| 533 |
=over 4 |
|---|
| 534 |
|
|---|
| 535 |
=item ($code, [$nmatch]) = getcode($str); |
|---|
| 536 |
|
|---|
| 537 |
Returns char code of $str. Return codes are as follows |
|---|
| 538 |
|
|---|
| 539 |
ascii Ascii (Contains no Japanese Code) |
|---|
| 540 |
binary Binary (Not Text File) |
|---|
| 541 |
euc EUC-JP |
|---|
| 542 |
sjis SHIFT_JIS |
|---|
| 543 |
jis JIS (ISO-2022-JP) |
|---|
| 544 |
ucs2 UCS2 (Raw Unicode) |
|---|
| 545 |
utf8 UTF8 |
|---|
| 546 |
|
|---|
| 547 |
When array context is used instead of scaler, it also returns how many |
|---|
| 548 |
character codes are found. As mentioned above, $str can be \$str |
|---|
| 549 |
instead. |
|---|
| 550 |
|
|---|
| 551 |
B<jcode.pl Users:> This function is 100% upper-conpatible with |
|---|
| 552 |
jcode::getcode() -- well, almost; |
|---|
| 553 |
|
|---|
| 554 |
* When its return value is an array, the order is the opposite; |
|---|
| 555 |
jcode::getcode() returns $nmatch first. |
|---|
| 556 |
|
|---|
| 557 |
* jcode::getcode() returns 'undef' when the number of EUC characters |
|---|
| 558 |
is equal to that of SJIS. Jcode::getcode() returns EUC. for |
|---|
| 559 |
Jcode.pm there is no in-betweens. |
|---|
| 560 |
|
|---|
| 561 |
=item Jcode::convert($str, [$ocode, $icode, $opt]); |
|---|
| 562 |
|
|---|
| 563 |
Converts $str to char code specified by $ocode. When $icode is specified |
|---|
| 564 |
also, it assumes $icode for input string instead of the one checked by |
|---|
| 565 |
getcode(). As mentioned above, $str can be \$str instead. |
|---|
| 566 |
|
|---|
| 567 |
B<jcode.pl Users:> This function is 100% upper-conpatible with |
|---|
| 568 |
jcode::convert() ! |
|---|
| 569 |
|
|---|
| 570 |
=back |
|---|
| 571 |
|
|---|
| 572 |
=cut |
|---|
| 573 |
|
|---|
| 574 |
sub getcode { |
|---|
| 575 |
my $thingy = shift; |
|---|
| 576 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 577 |
|
|---|
| 578 |
my ($code, $nmatch, $sjis, $euc, $utf8) = ("", 0, 0, 0, 0); |
|---|
| 579 |
if ($$r_str =~ /$RE{BIN}/o) { |
|---|
| 580 |
my $ucs2; |
|---|
| 581 |
$ucs2 += length($1) |
|---|
| 582 |
while $$r_str =~ /(\x00$RE{ASCII})+/go; |
|---|
| 583 |
if ($ucs2){ |
|---|
| 584 |
($code, $nmatch) = ('ucs2', $ucs2); |
|---|
| 585 |
}else{ |
|---|
| 586 |
($code, $nmatch) = ('binary', 0); |
|---|
| 587 |
} |
|---|
| 588 |
} |
|---|
| 589 |
elsif ($$r_str !~ /[\e\x80-\xff]/o) { |
|---|
| 590 |
($code, $nmatch) = ('ascii', 1); |
|---|
| 591 |
} |
|---|
| 592 |
elsif ($$r_str =~ |
|---|
| 593 |
m[ |
|---|
| 594 |
$RE{JIS_0208}|$RE{JIS_0212}|$RE{JIS_ASC}|$RE{JIS_KANA} |
|---|
| 595 |
]ox) |
|---|
| 596 |
{ |
|---|
| 597 |
($code, $nmatch) = ('jis', 1); |
|---|
| 598 |
} |
|---|
| 599 |
else { |
|---|
| 600 |
|
|---|
| 601 |
$sjis += length($1) |
|---|
| 602 |
while $$r_str =~ /((?:$RE{SJIS_C})+)/go; |
|---|
| 603 |
$euc += length($1) |
|---|
| 604 |
while $$r_str =~ /((?:$RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})+)/go; |
|---|
| 605 |
$utf8 += length($1) |
|---|
| 606 |
while $$r_str =~ /((?:$RE{UTF8})+)/go; |
|---|
| 607 |
$nmatch = _max($utf8, $sjis, $euc); |
|---|
| 608 |
carp ">DEBUG:sjis = $sjis, euc = $euc, utf8 = $utf8" if $DEBUG >= 3; |
|---|
| 609 |
$code = |
|---|
| 610 |
($euc > $sjis and $euc > $utf8) ? 'euc' : |
|---|
| 611 |
($sjis > $euc and $sjis > $utf8) ? 'sjis' : |
|---|
| 612 |
($utf8 > $euc and $utf8 > $sjis) ? 'utf8' : undef; |
|---|
| 613 |
} |
|---|
| 614 |
return wantarray ? ($code, $nmatch) : $code; |
|---|
| 615 |
} |
|---|
| 616 |
|
|---|
| 617 |
sub convert{ |
|---|
| 618 |
my $thingy = shift; |
|---|
| 619 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 620 |
my ($ocode, $icode, $opt) = @_; |
|---|
| 621 |
|
|---|
| 622 |
my $nmatch; |
|---|
| 623 |
($icode, $nmatch) = getcode($r_str) unless $icode; |
|---|
| 624 |
|
|---|
| 625 |
return $$r_str if $icode eq $ocode and !defined $opt; |
|---|
| 626 |
|
|---|
| 627 |
no strict qw(refs); |
|---|
| 628 |
my $method; |
|---|
| 629 |
|
|---|
| 630 |
|
|---|
| 631 |
|
|---|
| 632 |
load_module("Jcode::Unicode") if $icode =~ /ucs2|utf8/o; |
|---|
| 633 |
if ($icode and defined &{$method = $icode . "_euc"}){ |
|---|
| 634 |
carp "Dispatching \&$method" if $DEBUG >= 2; |
|---|
| 635 |
&{$method}($r_str) ; |
|---|
| 636 |
} |
|---|
| 637 |
|
|---|
| 638 |
|
|---|
| 639 |
|
|---|
| 640 |
if ($opt){ |
|---|
| 641 |
my $cmd = ($opt =~ /^z/o) ? "h2z" : ($opt =~ /^h/o) ? "z2h" : undef; |
|---|
| 642 |
if ($cmd){ |
|---|
| 643 |
require Jcode::H2Z; |
|---|
| 644 |
&{'Jcode::H2Z::' . $cmd}($r_str); |
|---|
| 645 |
} |
|---|
| 646 |
} |
|---|
| 647 |
|
|---|
| 648 |
|
|---|
| 649 |
|
|---|
| 650 |
load_module("Jcode::Unicode") if $ocode =~ /ucs2|utf8/o; |
|---|
| 651 |
if ($ocode and defined &{$method = "euc_" . $ocode}){ |
|---|
| 652 |
carp "Dispatching \&$method" if $DEBUG >= 2; |
|---|
| 653 |
&{$method}($r_str) ; |
|---|
| 654 |
} |
|---|
| 655 |
$$r_str; |
|---|
| 656 |
} |
|---|
| 657 |
|
|---|
| 658 |
|
|---|
| 659 |
|
|---|
| 660 |
sub jis_euc { |
|---|
| 661 |
my $thingy = shift; |
|---|
| 662 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 663 |
$$r_str =~ s( |
|---|
| 664 |
($RE{JIS_0212}|$RE{JIS_0208}|$RE{JIS_ASC}|$RE{JIS_KANA}) |
|---|
| 665 |
([^\e]*) |
|---|
| 666 |
) |
|---|
| 667 |
{ |
|---|
| 668 |
my ($esc, $str) = ($1, $2); |
|---|
| 669 |
if ($esc !~ /$RE{JIS_ASC}/o) { |
|---|
| 670 |
$str =~ tr/\x21-\x7e/\xa1-\xfe/; |
|---|
| 671 |
if ($esc =~ /$RE{JIS_KANA}/o) { |
|---|
| 672 |
$str =~ s/([\xa1-\xdf])/\x8e$1/og; |
|---|
| 673 |
} |
|---|
| 674 |
elsif ($esc =~ /$RE{JIS_0212}/o) { |
|---|
| 675 |
$str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; |
|---|
| 676 |
} |
|---|
| 677 |
} |
|---|
| 678 |
$str; |
|---|
| 679 |
}geox; |
|---|
| 680 |
$$r_str; |
|---|
| 681 |
} |
|---|
| 682 |
|
|---|
| 683 |
|
|---|
| 684 |
|
|---|
| 685 |
|
|---|
| 686 |
|
|---|
| 687 |
|
|---|
| 688 |
|
|---|
| 689 |
|
|---|
| 690 |
sub euc_jis{ |
|---|
| 691 |
my $thingy = shift; |
|---|
| 692 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 693 |
$$r_str =~ s{ |
|---|
| 694 |
((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) |
|---|
| 695 |
}{ |
|---|
| 696 |
my $str = $1; |
|---|
| 697 |
my $esc = |
|---|
| 698 |
( $str =~ tr/\x8E//d ) ? $ESC{KANA} : |
|---|
| 699 |
( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} : |
|---|
| 700 |
$ESC{JIS_0208}; |
|---|
| 701 |
$str =~ tr/\xA1-\xFE/\x21-\x7E/; |
|---|
| 702 |
$esc . $str . $ESC{ASC}; |
|---|
| 703 |
}geox; |
|---|
| 704 |
$$r_str =~ |
|---|
| 705 |
s/\Q$ESC{ASC}\E |
|---|
| 706 |
(\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox; |
|---|
| 707 |
$$r_str; |
|---|
| 708 |
} |
|---|
| 709 |
|
|---|
| 710 |
|
|---|
| 711 |
|
|---|
| 712 |
my %_S2E = (); |
|---|
| 713 |
my %_E2S = (); |
|---|
| 714 |
|
|---|
| 715 |
sub sjis_euc { |
|---|
| 716 |
my $thingy = shift; |
|---|
| 717 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 718 |
$$r_str =~ s( |
|---|
| 719 |
($RE{SJIS_C}|$RE{SJIS_KANA}) |
|---|
| 720 |
) |
|---|
| 721 |
{ |
|---|
| 722 |
my $str = $1; |
|---|
| 723 |
unless ($_S2E{$1}){ |
|---|
| 724 |
my ($c1, $c2) = unpack('CC', $str); |
|---|
| 725 |
if (0xa1 <= $c1 && $c1 <= 0xdf) { |
|---|
| 726 |
$c2 = $c1; |
|---|
| 727 |
$c1 = 0x8e; |
|---|
| 728 |
} elsif (0x9f <= $c2) { |
|---|
| 729 |
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60); |
|---|
| 730 |
$c2 += 2; |
|---|
| 731 |
} else { |
|---|
| 732 |
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61); |
|---|
| 733 |
$c2 += 0x60 + ($c2 < 0x7f); |
|---|
| 734 |
} |
|---|
| 735 |
$_S2E{$str} = pack('CC', $c1, $c2); |
|---|
| 736 |
} |
|---|
| 737 |
$_S2E{$str}; |
|---|
| 738 |
}geox; |
|---|
| 739 |
$$r_str; |
|---|
| 740 |
} |
|---|
| 741 |
|
|---|
| 742 |
|
|---|
| 743 |
|
|---|
| 744 |
sub euc_sjis { |
|---|
| 745 |
my $thingy = shift; |
|---|
| 746 |
my $r_str = ref $thingy ? $thingy : \$thingy; |
|---|
| 747 |
$$r_str =~ s( |
|---|
| 748 |
($RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212}) |
|---|
| 749 |
) |
|---|
| 750 |
{ |
|---|
| 751 |
my $str = $1; |
|---|
| 752 |
unless ($_E2S{$str}){ |
|---|
| 753 |
my ($c1, $c2) = unpack('CC', $str); |
|---|
| 754 |
if ($c1 == 0x8e) { |
|---|
| 755 |
$_E2S{$str} = chr($c2); |
|---|
| 756 |
} elsif ($c1 == 0x8f) { |
|---|
| 757 |
$_E2S{$str} = $CHARCODE{UNDEF_SJIS}; |
|---|
| 758 |
}else { |
|---|
| 759 |
if ($c1 % 2) { |
|---|
| 760 |
$c1 = ($c1>>1) + ($c1 < 0xdf ? 0x31 : 0x71); |
|---|
| 761 |
$c2 -= 0x60 + ($c2 < 0xe0); |
|---|
| 762 |
} else { |
|---|
| 763 |
$c1 = ($c1>>1) + ($c1 < 0xdf ? 0x30 : 0x70); |
|---|
| 764 |
$c2 -= 2; |
|---|
| 765 |
} |
|---|
| 766 |
$_E2S{$str} = pack('CC', $c1, $c2); |
|---|
| 767 |
} |
|---|
| 768 |
} |
|---|
| 769 |
$_E2S{$str}; |
|---|
| 770 |
}geox; |
|---|
| 771 |
$$r_str; |
|---|
| 772 |
} |
|---|
| 773 |
|
|---|
| 774 |
|
|---|
| 775 |
|
|---|
| 776 |
|
|---|
| 777 |
|
|---|
| 778 |
sub _max { |
|---|
| 779 |
my $result = shift; |
|---|
| 780 |
for my $n (@_){ |
|---|
| 781 |
$result = $n if $n > $result; |
|---|
| 782 |
} |
|---|
| 783 |
return $result; |
|---|
| 784 |
} |
|---|
| 785 |
|
|---|
| 786 |
1; |
|---|
| 787 |
|
|---|
| 788 |
__END__ |
|---|
| 789 |
|
|---|
| 790 |
=head1 BUGS |
|---|
| 791 |
|
|---|
| 792 |
Unicode support by Jcode is far from efficient! |
|---|
| 793 |
|
|---|
| 794 |
=head1 IN FUTURE |
|---|
| 795 |
|
|---|
| 796 |
Hopefully Jcode will be superceded by Encode module that is part of |
|---|
| 797 |
the standard module on Perl 5.7 and up |
|---|
| 798 |
|
|---|
| 799 |
=head1 ACKNOWLEDGEMENTS |
|---|
| 800 |
|
|---|
| 801 |
This package owes a lot in motivation, design, and code, to the jcode.pl |
|---|
| 802 |
for Perl4 by Kazumasa Utashiro <utashiro@iij.ad.jp>. |
|---|
| 803 |
|
|---|
| 804 |
Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp> has helped me polish regexp from the |
|---|
| 805 |
very first stage of development. |
|---|
| 806 |
|
|---|
| 807 |
And folks at Jcode Mailing list <jcode5@ring.gr.jp>. Without them, I |
|---|
| 808 |
couldn't have coded this far. |
|---|
| 809 |
|
|---|
| 810 |
=head1 SEE ALSO |
|---|
| 811 |
|
|---|
| 812 |
L<Jcode::Unicode> |
|---|
| 813 |
|
|---|
| 814 |
L<Jcode::Unicode::NoXS> |
|---|
| 815 |
|
|---|
| 816 |
http://www.iana.org/assignments/character-sets |
|---|
| 817 |
|
|---|
| 818 |
L<Encode> |
|---|
| 819 |
|
|---|
| 820 |
=head1 COPYRIGHT |
|---|
| 821 |
|
|---|
| 822 |
Copyright 1999 Dan Kogai <dankogai@dan.co.jp> |
|---|
| 823 |
|
|---|
| 824 |
This library is free software; you can redistribute it |
|---|
| 825 |
and/or modify it under the same terms as Perl itself. |
|---|
| 826 |
|
|---|
| 827 |
=cut |
|---|
| 828 |
|
|---|