root/tags/1linebbsv2-1.9/Jcode.pm

Revision 113, 18.1 kB (checked in by akiyan, 2 years ago)

maintenance

Line 
1 #
2 # $Id: Jcode.pm,v 0.83 2003/03/16 16:15:34 dankogai Exp dankogai $
3 #
4
5 =head1 NAME
6
7 Jcode - Japanese Charset Handler
8
9 =head1 SYNOPSIS
10
11  use Jcode;
12  #
13  # traditional
14  Jcode::convert(\$str, $ocode, $icode, "z");
15  # or OOP!
16  print Jcode->new($str)->h2z->tr($from, $to)->utf8;
17
18 =cut
19
20 =head1 DESCRIPTION
21
22 Jcode.pm supports both object and traditional approach. 
23 With object approach, you can go like;
24
25 $iso_2022_jp = Jcode->new($str)->h2z->jis;
26
27 Which is more elegant than;
28
29 $iso_2022_jp = &jcode::convert(\$str,'jis',jcode::getcode(\str), "z");
30
31 For those unfamiliar with objects, Jcode.pm still supports getcode()
32 and convert().
33
34 =cut
35
36 package Jcode;
37 use 5.004;
38 use Carp;
39 use strict;
40 use vars qw($RCSID $VERSION $DEBUG);
41
42 $RCSID = q$Id: Jcode.pm,v 0.83 2003/03/16 16:15:34 dankogai Exp dankogai $;
43 $VERSION = do { my @r = (q$Revision: 0.83 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
44 $DEBUG = 0;
45
46 use Exporter;
47 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
48 @ISA         = qw(Exporter);
49 @EXPORT      = qw(jcode getcode);
50 @EXPORT_OK   = qw($RCSID $VERSION $DEBUG);
51 %EXPORT_TAGS = ( all => [ @EXPORT_OK, @EXPORT ] );
52
53
54 use vars qw($USE_CACHE $NOXS);
55
56 $USE_CACHE = 1;
57 $NOXS = 0;
58
59 print $RCSID, "\n" if $DEBUG;
60
61 use Jcode::Constants qw(:all);
62
63 use overload
64     q("") => sub { ${$_[0]->[0]} },
65     q(==) => sub {overload::StrVal($_[0]) eq overload::StrVal($_[1])},
66     q(=)  => sub { $_[0]->set( $_[1] ) },
67     q(.=) => sub { $_[0]->append( $_[1] ) },
68     fallback => 1,
69     ;
70
71 =head1 Methods
72
73 Methods mentioned here all return Jcode object unless otherwise mentioned.
74
75 =over 4
76
77 =item $j = Jcode-E<gt>new($str [, $icode]);
78
79 Creates Jcode object $j from $str.  Input code is automatically checked
80 unless you explicitly set $icode. For available charset, see L<getcode>
81 below.
82
83 The object keeps the string in EUC format enternaly.  When the object
84 itself is evaluated, it returns the EUC-converted string so you can
85 "print $j;" without calling access method if you are using EUC
86 (thanks to function overload).
87
88 =item Passing Reference
89
90 Instead of scalar value, You can use reference as
91
92 Jcode->new(\$str);
93
94 This saves time a little bit.  In exchange of the value of $str being
95 converted. (In a way, $str is now "tied" to jcode object).
96
97 =item $j-E<gt>set($str [, $icode]);
98
99 Sets $j's internal string to $str.  Handy when you use Jcode object repeatedly
100 (saves time and memory to create object).
101
102  # converts mailbox to SJIS format
103  my $jconv = new Jcode;
104  $/ = 00;
105  while(&lt;&gt;){
106      print $jconv->set(\$_)->mime_decode->sjis;
107  }
108
109 =item $j-E<gt>append($str [, $icode]);
110
111 Appends $str to $j's internal string.
112
113 =back
114
115 =cut
116
117 sub new {
118     my $class = shift;
119     my ($thingy, $icode) = @_;
120     my $r_str = ref $thingy ? $thingy : \$thingy;
121     my $nmatch;
122     ($icode, $nmatch) = getcode($r_str) unless $icode;
123     convert($r_str, 'euc', $icode);
124     my $self = [
125         $r_str,
126         $icode,
127         $nmatch,
128     ];
129     carp "Object of class $class created" if $DEBUG >= 2;
130     bless $self, $class;
131 }
132
133 sub r_str  { $_[0]->[0] }
134 sub icode  { $_[0]->[1] }
135 sub nmatch { $_[0]->[2] }
136
137 sub set {
138     my $self = shift;
139     my ($thingy, $icode) = @_;
140     my $r_str = ref $thingy ? $thingy : \$thingy;
141     my $nmatch;
142     ($icode, $nmatch) = getcode($r_str) unless $icode;
143     convert($r_str, 'euc', $icode);
144     $self->[0] = $r_str;
145     $self->[1] = $icode;
146     $self->[2] = $nmatch;
147     return $self;
148 }
149
150 sub append {
151     my $self = shift;
152     my ($thingy, $icode) = @_;
153     my $r_str = ref $thingy ? $thingy : \$thingy;
154     my $nmatch;
155     ($icode, $nmatch) = getcode($r_str) unless $icode;
156     convert($r_str, 'euc', $icode);
157     ${$self->[0]} .= $$r_str;
158     $self->[1] = $icode;
159     $self->[2] = $nmatch;
160     return $self;
161 }
162
163 =over 4
164
165 =item $j = jcode($str [, $icode]);
166
167 shortcut for Jcode->new() so you can go like;
168
169 $sjis = jcode($str)->sjis;
170
171 =item $euc = $j-E<gt>euc;
172
173 =item $jis = $j-E<gt>jis;
174
175 =item $sjis = $j-E<gt>sjis;
176
177 What you code is what you get :)
178
179 =item $iso_2022_jp = $j-E<gt>iso_2022_jp
180
181 Same as $j->z2h->jis. 
182 Hankaku Kanas are forcibly converted to Zenkaku.
183
184 =back
185
186 =cut
187
188 sub jcode { return Jcode->new(@_) }
189 sub euc   { return ${$_[0]->[0]} }
190 sub jis   { return  &euc_jis(${$_[0]->[0]})}
191 sub sjis  { return &euc_sjis(${$_[0]->[0]})}
192 sub iso_2022_jp{return $_[0]->h2z->jis}
193
194 =over 4
195
196 =item [@lines =] $jcode-E<gt>jfold([$bytes_per_line, $newline_str]);
197
198 folds lines in jcode string every $bytes_per_line (default: 72)
199 in a way that does not clobber the multibyte string.
200 (Sorry, no Kinsoku done!)
201 with a newline string spified by $newline_str (default: \n). 
202
203 =back
204
205 =cut
206
207 sub jfold{
208     my $self = shift;
209     my ($bpl, $nl) = @_;
210     $bpl ||= 72;
211     $nl  ||= "\n";
212     my $r_str = $self->[0];
213     my (@lines, $len, $i);
214     while ($$r_str =~
215            m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo)
216     {
217         if ($len + length($1) > $bpl){ # fold!
218             $i++;
219             $len = 0;
220         }
221         $lines[$i] .= $1;
222         $len += length($1);
223     }
224     defined($lines[$i]) or pop @lines;
225     $$r_str = join($nl, @lines);
226     return wantarray ? @lines : $self;
227 }
228
229 =pod
230
231 =over 4
232
233 =item $length = $jcode-E<gt>jlength();
234
235 returns character length properly, rather than byte length.
236
237 =back
238
239 =cut
240
241 sub jlength {
242     my $self = shift;
243     my $r_str = $self->[0];
244     return scalar (my @char = $$r_str =~ m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo);
245 }
246
247 =head2 Methods that use MIME::Base64
248
249 To use methods below, you need MIME::Base64.  To install, simply
250
251    perl -MCPAN -e 'CPAN::Shell->install("MIME::Base64")'
252
253 =over 4
254
255 =item $mime_header = $j-E<gt>mime_encode([$lf, $bpl]);
256
257 Converts $str to MIME-Header documented in RFC1522.
258 When $lf is specified, it uses $lf to fold line (default: \n).
259 When $bpl is specified, it uses $bpl for the number of bytes (default: 76;
260 this number must be smaller than 76).
261
262 =item $j-E<gt>mime_decode;
263
264 Decodes MIME-Header in Jcode object.
265
266 You can retrieve the number of matches via $j->nmatch;
267
268 =back
269
270 =cut
271
272 sub mime_encode{
273     my $self = shift;
274     my $r_str = $self->[0];
275     my $lf  = shift || "\n";
276     my $bpl = shift || 76;
277
278     my ($trailing_crlf) = ($$r_str =~ /(\n|\r|\x0d\x0a)$/o);
279     my $str  = _mime_unstructured_header($$r_str, $lf, $bpl);
280     not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o;
281     $str;
282 }
283
284 #
285 # shamelessly stolen from
286 # http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64
287 #
288
289 sub _add_encoded_word {
290     require MIME::Base64;
291     my($str, $line, $bpl) = @_;
292     my $result = '';
293     while (length($str)) {
294         my $target = $str;
295         $str = '';
296         if (length($line) + 22 +
297             ($target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o) * 8 > $bpl) {
298             $line =~ s/[ \t\n\r]*$/\n/;
299             $result .= $line;
300             $line = ' ';
301         }
302         while (1) {
303             my $iso_2022_jp = jcode($target, 'euc')->iso_2022_jp;
304             if (my $count = ($iso_2022_jp =~ tr/\x80-\xff//d)){
305                 $DEBUG and warn $count;
306                 $target = jcode($iso_2022_jp, 'iso_2022_jp')->euc;
307             }
308             my $encoded = '=?ISO-2022-JP?B?' .
309               MIME::Base64::encode_base64($iso_2022_jp, '')
310                   . '?=';
311             if (length($encoded) + length($line) > $bpl) {
312                 $target =~
313                     s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o;
314                 $str = $1 . $str;
315             } else {
316                 $line .= $encoded;
317                 last;
318             }
319         }
320     }
321     return $result . $line;
322 }
323
324 sub _mime_unstructured_header {
325     my ($oldheader, $lf, $bpl) = @_;
326     my(@words, @wordstmp, $i);
327     my $header = '';
328     $oldheader =~ s/\s+$//;
329     @wordstmp = split /\s+/, $oldheader;
330     for ($i = 0; $i < $#wordstmp; $i++) {
331         if ($wordstmp[$i] !~ /^[\x21-\x7E]+$/ and
332             $wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/) {
333             $wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]";
334         } else {
335             push(@words, $wordstmp[$i]);
336         }
337     }
338     push(@words, $wordstmp[-1]);
339     for my $word (@words) {
340         if ($word =~ /^[\x21-\x7E]+$/) {
341             $header =~ /(?:.*\n)*(.*)/;
342             if (length($1) + length($word) > $bpl) {
343                 $header .= "$lf $word";
344             } else {
345                 $header .= $word;
346             }
347         } else {
348             $header = _add_encoded_word($word, $header, $bpl);
349         }
350         $header =~ /(?:.*\n)*(.*)/;
351         if (length($1) == $bpl) {
352             $header .= "$lf ";
353         } else {
354             $header .= ' ';
355         }
356     }
357     $header =~ s/\n? $/\n/;
358     $header;
359 }
360
361 # see http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64
362 #$lws = '(?:(?:\x0d\x0a)?[ \t])+';
363 #$ew_regex = '=\?ISO-2022-JP\?B\?([A-Za-z0-9+/]+=*)\?=';
364 #$str =~ s/($ew_regex)$lws(?=$ew_regex)/$1/gio;
365 #$str =~ s/$lws/ /go; $str =~ s/$ew_regex/decode_base64($1)/egio;
366
367 sub mime_decode{
368     require MIME::Base64; # not use
369     my $self = shift;
370     my $r_str = $self->[0];
371     my $re_lws = '(?:(?:\r|\n|\x0d\x0a)?[ \t])+';
372     my $re_ew = '=\?[Ii][Ss][Oo]-2022-[Jj][Pp]\?[Bb]\?([A-Za-z0-9+/]+=*)\?=';
373     $$r_str =~ s/($re_ew)$re_lws(?=$re_ew)/$1/sgo;
374     $$r_str =~ s/$re_lws/ /go;
375     $self->[2] =
376         ($$r_str =~
377          s/$re_ew/jis_euc(MIME::Base64::decode_base64($1))/ego
378          );
379     $self;
380 }
381
382
383 =head2 Methods implemented by Jcode::H2Z
384
385 Methods below are actually implemented in Jcode::H2Z.
386
387 =over 4
388
389 =item $j-E<gt>h2z([$keep_dakuten]);
390
391 Converts X201 kana (Hankaku) to X208 kana (Zenkaku). 
392 When $keep_dakuten is set, it leaves dakuten as is
393 (That is, "ka + dakuten" is left as is instead of
394 being converted to "ga")
395
396 You can retrieve the number of matches via $j->nmatch;
397
398 =item $j-E<gt>z2h;
399
400 Converts X208 kana (Zenkaku) to X201 kana (Hankaku).
401
402 You can retrieve the number of matches via $j->nmatch;
403
404 =back
405
406 =cut
407
408 sub h2z {
409     require Jcode::H2Z; # not use
410     my $self = shift;
411     $self->[2] = Jcode::H2Z::h2z($self->[0], @_);
412     return $self;
413 }
414
415
416 sub z2h {
417     require Jcode::H2Z; # not use
418     my $self = shift;
419     $self->[2] =  &Jcode::H2Z::z2h($self->[0], @_);
420     return $self;
421 }
422
423
424 =head2 Methods implemented in Jcode::Tr
425
426 Methods here are actually implemented in Jcode::Tr.
427
428 =over 4
429
430 =item  $j-E<gt>tr($from, $to);
431
432 Applies tr on Jcode object. $from and $to can contain EUC Japanese.
433
434 You can retrieve the number of matches via $j->nmatch;
435
436 =back
437
438 =cut
439
440 sub tr{
441     require Jcode::Tr; # not use
442     my $self = shift;
443     $self->[2] = Jcode::Tr::tr($self->[0], @_);
444     return $self;
445 }
446
447 #
448 # load needed module depending on the configuration just once!
449 #
450
451 use vars qw(%PKG_LOADED);
452 sub load_module{
453     my $pkg = shift;
454     return $pkg if $PKG_LOADED{$pkg}++;
455     unless ($NOXS){
456         eval qq( require $pkg; );
457         unless ($@){
458             carp "$pkg loaded." if $DEBUG;
459             return $pkg;
460         }
461     }
462     $pkg .= "::NoXS";
463     eval qq( require $pkg; );
464     unless ($@){
465         carp "$pkg loaded" if $DEBUG;
466     }else{
467         croak "Loading $pkg failed!";
468     }
469     $pkg;
470 }
471
472 =head2 Methods implemented in Jcode::Unicode
473
474 If your perl does not support XS (or you can't C<perl Makefile.PL>,
475 Jcode::Unicode::NoXS will be used.
476
477 See L<Jcode::Unicode> and L<Jcode::Unicode::NoXS> for details
478
479 =over 4
480
481 =item $ucs2 = $j-E<gt>ucs2;
482
483 Returns UCS2 (Raw Unicode) string.
484
485 =item $ucs2 = $j-E<gt>utf8;
486
487 Returns utf8 String.
488
489 =back
490
491 =cut
492
493 sub ucs2{
494     load_module("Jcode::Unicode");
495     euc_ucs2(${$_[0]->[0]});
496 }
497
498 sub utf8{
499     load_module("Jcode::Unicode");
500     euc_utf8(${$_[0]->[0]});
501 }
502
503 =head2 Instance Variables
504
505 If you need to access instance variables of Jcode object, use access
506 methods below instead of directly accessing them (That's what OOP
507 is all about)
508
509 FYI, Jcode uses a ref to array instead of ref to hash (common way) to
510 optimize speed (Actually you don't have to know as long as you use
511 access methods instead; Once again, that's OOP)
512
513 =over 4
514
515 =item $j-E<gt>r_str
516
517 Reference to the EUC-coded String.
518
519 =item $j-E<gt>icode
520
521 Input charcode in recent operation.
522
523 =item $j-E<gt>nmatch
524
525 Number of matches (Used in $j->tr, etc.)
526
527 =back
528
529 =cut
530
531 =head1 Subroutines
532
533 =over 4
534
535 =item ($code, [$nmatch]) = getcode($str);
536
537 Returns char code of $str. Return codes are as follows
538
539  ascii   Ascii (Contains no Japanese Code)
540  binary  Binary (Not Text File)
541  euc     EUC-JP
542  sjis    SHIFT_JIS
543  jis     JIS (ISO-2022-JP)
544  ucs2    UCS2 (Raw Unicode)
545  utf8    UTF8
546
547 When array context is used instead of scaler, it also returns how many
548 character codes are found.  As mentioned above, $str can be \$str
549 instead.
550
551 B<jcode.pl Users:>  This function is 100% upper-conpatible with
552 jcode::getcode() -- well, almost;
553
554  * When its return value is an array, the order is the opposite;
555    jcode::getcode() returns $nmatch first.
556
557  * jcode::getcode() returns 'undef' when the number of EUC characters
558    is equal to that of SJIS.  Jcode::getcode() returns EUC.  for
559    Jcode.pm there is no in-betweens.
560
561 =item Jcode::convert($str, [$ocode, $icode, $opt]);
562
563 Converts $str to char code specified by $ocode.  When $icode is specified
564 also, it assumes $icode for input string instead of the one checked by
565 getcode(). As mentioned above, $str can be \$str instead.
566
567 B<jcode.pl Users:>  This function is 100% upper-conpatible with
568 jcode::convert() !
569
570 =back
571
572 =cut
573
574 sub getcode {
575     my $thingy = shift;
576     my $r_str = ref $thingy ? $thingy : \$thingy;
577
578     my ($code, $nmatch, $sjis, $euc, $utf8) = ("", 0, 0, 0, 0);
579     if ($$r_str =~ /$RE{BIN}/o) {       # 'binary'
580         my $ucs2;
581         $ucs2 += length($1)
582             while $$r_str =~ /(\x00$RE{ASCII})+/go;
583         if ($ucs2){      # smells like raw unicode
584             ($code, $nmatch) = ('ucs2', $ucs2);
585         }else{
586             ($code, $nmatch) = ('binary', 0);
587          }
588     }
589     elsif ($$r_str !~ /[\e\x80-\xff]/o) {       # not Japanese
590         ($code, $nmatch) = ('ascii', 1);
591     }                           # 'jis'
592     elsif ($$r_str =~
593            m[
594              $RE{JIS_0208}|$RE{JIS_0212}|$RE{JIS_ASC}|$RE{JIS_KANA}
595            ]ox)
596     {
597         ($code, $nmatch) = ('jis', 1);
598     }
599     else { # should be euc|sjis|utf8
600         # use of (?:) by Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp>
601         $sjis += length($1)
602             while $$r_str =~ /((?:$RE{SJIS_C})+)/go;
603         $euc  += length($1)
604             while $$r_str =~ /((?:$RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})+)/go;
605         $utf8  += length($1)
606             while $$r_str =~ /((?:$RE{UTF8})+)/go;
607         $nmatch = _max($utf8, $sjis, $euc);
608         carp ">DEBUG:sjis = $sjis, euc = $euc, utf8 = $utf8" if $DEBUG >= 3;
609         $code =
610             ($euc > $sjis and $euc > $utf8) ? 'euc' :
611                 ($sjis > $euc and $sjis > $utf8) ? 'sjis' :
612                     ($utf8 > $euc and $utf8 > $sjis) ? 'utf8' : undef;
613     }
614     return wantarray ? ($code, $nmatch) : $code;
615 }
616
617 sub convert{
618     my $thingy = shift;
619     my $r_str = ref $thingy ? $thingy : \$thingy;
620     my ($ocode, $icode, $opt) = @_;
621
622     my $nmatch;
623     ($icode, $nmatch) = getcode($r_str) unless $icode;
624
625     return $$r_str if $icode eq $ocode and !defined $opt; # do nothin'
626
627     no strict qw(refs);
628     my $method;
629
630     # convert to EUC
631
632     load_module("Jcode::Unicode") if $icode =~ /ucs2|utf8/o;
633     if ($icode and defined &{$method = $icode . "_euc"}){
634         carp "Dispatching \&$method" if $DEBUG >= 2;
635         &{$method}($r_str) ;
636     }
637
638     # h2z or z2h
639
640     if ($opt){
641         my $cmd = ($opt =~ /^z/o) ? "h2z" : ($opt =~ /^h/o) ? "z2h" : undef;
642         if ($cmd){
643             require Jcode::H2Z;
644             &{'Jcode::H2Z::' . $cmd}($r_str);
645         }
646     }
647
648     # convert to $ocode
649
650     load_module("Jcode::Unicode") if $ocode =~ /ucs2|utf8/o;
651     if ($ocode and defined &{$method = "euc_" . $ocode}){
652         carp "Dispatching \&$method" if $DEBUG >= 2;
653         &{$method}($r_str) ;
654     }
655     $$r_str;
656 }
657
658 # JIS<->EUC
659
660 sub jis_euc {
661     my $thingy = shift;
662     my $r_str = ref $thingy ? $thingy : \$thingy;
663     $$r_str =~ s(
664                  ($RE{JIS_0212}|$RE{JIS_0208}|$RE{JIS_ASC}|$RE{JIS_KANA})
665                  ([^\e]*)
666                  )
667     {
668         my ($esc, $str) = ($1, $2);
669         if ($esc !~ /$RE{JIS_ASC}/o) {
670             $str =~ tr/\x21-\x7e/\xa1-\xfe/;
671             if ($esc =~ /$RE{JIS_KANA}/o) {
672                 $str =~ s/([\xa1-\xdf])/\x8e$1/og;
673             }
674             elsif ($esc =~ /$RE{JIS_0212}/o) {
675                 $str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
676             }
677         }
678         $str;
679     }geox;
680     $$r_str;
681 }
682
683 #
684 # euc_jis
685 #
686 # Based upon the contribution of
687 # Kazuto Ichimura <ichimura@shimada.nuee.nagoya-u.ac.jp>
688 # optimized by <ohzaki@iod.ricoh.co.jp>
689
690 sub euc_jis{
691     my $thingy = shift;
692     my $r_str = ref $thingy ? $thingy : \$thingy;
693     $$r_str =~ s{
694         ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+)
695         }{
696             my $str = $1;
697             my $esc =
698                 ( $str =~ tr/\x8E//d ) ? $ESC{KANA} :
699                     ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} :
700                         $ESC{JIS_0208};
701             $str =~ tr/\xA1-\xFE/\x21-\x7E/;
702             $esc . $str . $ESC{ASC};
703         }geox;
704     $$r_str =~
705         s/\Q$ESC{ASC}\E
706             (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox;
707     $$r_str;
708 }
709
710 # EUC<->SJIS
711
712 my %_S2E = ();
713 my %_E2S = ();
714
715 sub sjis_euc {
716     my $thingy = shift;
717     my $r_str = ref $thingy ? $thingy : \$thingy;
718     $$r_str =~ s(
719                  ($RE{SJIS_C}|$RE{SJIS_KANA})
720              )
721     {
722         my $str = $1;
723         unless ($_S2E{$1}){
724             my ($c1, $c2) = unpack('CC', $str);
725             if (0xa1 <= $c1 && $c1 <= 0xdf) {
726                 $c2 = $c1;
727                 $c1 = 0x8e;
728             } elsif (0x9f <= $c2) {
729                 $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60);
730                 $c2 += 2;
731             } else {
732                 $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61);
733                 $c2 += 0x60 + ($c2 < 0x7f);
734             }
735             $_S2E{$str} = pack('CC', $c1, $c2);
736         }
737         $_S2E{$str};
738     }geox;
739     $$r_str;
740 }
741
742 #
743
744 sub euc_sjis {
745     my $thingy = shift;
746     my $r_str = ref $thingy ? $thingy : \$thingy;
747     $$r_str =~ s(
748                  ($RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})
749                  )
750     {
751         my $str = $1;
752         unless ($_E2S{$str}){
753             my ($c1, $c2) = unpack('CC', $str);
754             if ($c1 == 0x8e) {          # SS2
755                 $_E2S{$str} = chr($c2);
756             } elsif ($c1 == 0x8f) {     # SS3
757                 $_E2S{$str} = $CHARCODE{UNDEF_SJIS};
758             }else { #SS1 or X0208
759                 if ($c1 % 2) {
760                     $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x31 : 0x71);
761                     $c2 -= 0x60 + ($c2 < 0xe0);
762                 } else {
763                     $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x30 : 0x70);
764                     $c2 -= 2;
765                 }
766                 $_E2S{$str} = pack('CC', $c1, $c2);
767             }
768         }
769         $_E2S{$str};
770     }geox;
771     $$r_str;
772 }
773
774 #
775 # Util. Functions
776 #
777
778 sub _max {
779     my $result = shift;
780     for my $n (@_){
781         $result = $n if $n > $result;
782     }
783     return $result;
784 }
785
786 1;
787
788 __END__
789
790 =head1 BUGS
791
792 Unicode support by Jcode is far from efficient!
793
794 =head1 IN FUTURE
795
796 Hopefully Jcode will be superceded by Encode module that is part of
797 the standard module on Perl 5.7 and up
798
799 =head1 ACKNOWLEDGEMENTS
800
801 This package owes a lot in motivation, design, and code, to the jcode.pl
802 for Perl4 by Kazumasa Utashiro <utashiro@iij.ad.jp>.
803
804 Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp> has helped me polish regexp from the
805 very first stage of development.
806
807 And folks at Jcode Mailing list <jcode5@ring.gr.jp>.  Without them, I
808 couldn't have coded this far.
809
810 =head1 SEE ALSO
811
812 L<Jcode::Unicode>
813
814 L<Jcode::Unicode::NoXS>
815
816 http://www.iana.org/assignments/character-sets
817
818 L<Encode>
819
820 =head1 COPYRIGHT
821
822 Copyright 1999 Dan Kogai <dankogai@dan.co.jp>
823
824 This library is free software; you can redistribute it
825 and/or modify it under the same terms as Perl itself.
826
827 =cut
828
Note: See TracBrowser for help on using the browser.