[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
1 2 require 5; 3 # The documentation is at the end. 4 # Time-stamp: "2004-05-07 15:31:25 ADT" 5 package Pod::Escapes; 6 require Exporter; 7 @ISA = ('Exporter'); 8 $VERSION = '1.04'; 9 @EXPORT_OK = qw( 10 %Code2USASCII 11 %Name2character 12 %Name2character_number 13 %Latin1Code_to_fallback 14 %Latin1Char_to_fallback 15 e2char 16 e2charnum 17 ); 18 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); 19 20 #========================================================================== 21 22 use strict; 23 use vars qw( 24 %Code2USASCII 25 %Name2character 26 %Name2character_number 27 %Latin1Code_to_fallback 28 %Latin1Char_to_fallback 29 $FAR_CHAR 30 $FAR_CHAR_NUMBER 31 $NOT_ASCII 32 ); 33 34 $FAR_CHAR = "?" unless defined $FAR_CHAR; 35 $FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER; 36 37 $NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII; 38 39 #-------------------------------------------------------------------------- 40 sub e2char { 41 my $in = $_[0]; 42 return undef unless defined $in and length $in; 43 44 # Convert to decimal: 45 if($in =~ m/^(0[0-7]*)$/s ) { 46 $in = oct $in; 47 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { 48 $in = hex $1; 49 } # else it's decimal, or named 50 51 if($NOT_ASCII) { 52 # We're in bizarro world of not-ASCII! 53 # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR. 54 unless($in =~ m/^\d+$/s) { 55 # It's a named character reference. Get its numeric Unicode value. 56 $in = $Name2character{$in}; 57 return undef unless defined $in; # (if there's no such name) 58 $in = ord $in; # (All ents must be one character long.) 59 # ...So $in holds the char's US-ASCII numeric value, which we'll 60 # now go get the local equivalent for. 61 } 62 63 # It's numeric, whether by origin or by mutation from a known name 64 return $Code2USASCII{$in} # so "65" => "A" everywhere 65 || $Latin1Code_to_fallback{$in} # Fallback. 66 || $FAR_CHAR; # Fall further back 67 } 68 69 # Normal handling: 70 if($in =~ m/^\d+$/s) { 71 if($] < 5.007 and $in > 255) { # can't be trusted with Unicode 72 return $FAR_CHAR; 73 } else { 74 return chr($in); 75 } 76 } else { 77 return $Name2character{$in}; # returns undef if unknown 78 } 79 } 80 81 #-------------------------------------------------------------------------- 82 sub e2charnum { 83 my $in = $_[0]; 84 return undef unless defined $in and length $in; 85 86 # Convert to decimal: 87 if($in =~ m/^(0[0-7]*)$/s ) { 88 $in = oct $in; 89 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { 90 $in = hex $1; 91 } # else it's decimal, or named 92 93 if($in =~ m/^\d+$/s) { 94 return 0 + $in; 95 } else { 96 return $Name2character_number{$in}; # returns undef if unknown 97 } 98 } 99 100 #-------------------------------------------------------------------------- 101 102 %Name2character_number = ( 103 # General XML/XHTML: 104 'lt' => 60, 105 'gt' => 62, 106 'quot' => 34, 107 'amp' => 38, 108 'apos' => 39, 109 110 # POD-specific: 111 'sol' => 47, 112 'verbar' => 124, 113 114 'lchevron' => 171, # legacy for laquo 115 'rchevron' => 187, # legacy for raquo 116 117 # Remember, grave looks like \ (as in virtu\) 118 # acute looks like / (as in re/sume/) 119 # circumflex looks like ^ (as in papier ma^che/) 120 # umlaut/dieresis looks like " (as in nai"ve, Chloe") 121 122 # From the XHTML 1 .ent files: 123 'nbsp' , 160, 124 'iexcl' , 161, 125 'cent' , 162, 126 'pound' , 163, 127 'curren' , 164, 128 'yen' , 165, 129 'brvbar' , 166, 130 'sect' , 167, 131 'uml' , 168, 132 'copy' , 169, 133 'ordf' , 170, 134 'laquo' , 171, 135 'not' , 172, 136 'shy' , 173, 137 'reg' , 174, 138 'macr' , 175, 139 'deg' , 176, 140 'plusmn' , 177, 141 'sup2' , 178, 142 'sup3' , 179, 143 'acute' , 180, 144 'micro' , 181, 145 'para' , 182, 146 'middot' , 183, 147 'cedil' , 184, 148 'sup1' , 185, 149 'ordm' , 186, 150 'raquo' , 187, 151 'frac14' , 188, 152 'frac12' , 189, 153 'frac34' , 190, 154 'iquest' , 191, 155 'Agrave' , 192, 156 'Aacute' , 193, 157 'Acirc' , 194, 158 'Atilde' , 195, 159 'Auml' , 196, 160 'Aring' , 197, 161 'AElig' , 198, 162 'Ccedil' , 199, 163 'Egrave' , 200, 164 'Eacute' , 201, 165 'Ecirc' , 202, 166 'Euml' , 203, 167 'Igrave' , 204, 168 'Iacute' , 205, 169 'Icirc' , 206, 170 'Iuml' , 207, 171 'ETH' , 208, 172 'Ntilde' , 209, 173 'Ograve' , 210, 174 'Oacute' , 211, 175 'Ocirc' , 212, 176 'Otilde' , 213, 177 'Ouml' , 214, 178 'times' , 215, 179 'Oslash' , 216, 180 'Ugrave' , 217, 181 'Uacute' , 218, 182 'Ucirc' , 219, 183 'Uuml' , 220, 184 'Yacute' , 221, 185 'THORN' , 222, 186 'szlig' , 223, 187 'agrave' , 224, 188 'aacute' , 225, 189 'acirc' , 226, 190 'atilde' , 227, 191 'auml' , 228, 192 'aring' , 229, 193 'aelig' , 230, 194 'ccedil' , 231, 195 'egrave' , 232, 196 'eacute' , 233, 197 'ecirc' , 234, 198 'euml' , 235, 199 'igrave' , 236, 200 'iacute' , 237, 201 'icirc' , 238, 202 'iuml' , 239, 203 'eth' , 240, 204 'ntilde' , 241, 205 'ograve' , 242, 206 'oacute' , 243, 207 'ocirc' , 244, 208 'otilde' , 245, 209 'ouml' , 246, 210 'divide' , 247, 211 'oslash' , 248, 212 'ugrave' , 249, 213 'uacute' , 250, 214 'ucirc' , 251, 215 'uuml' , 252, 216 'yacute' , 253, 217 'thorn' , 254, 218 'yuml' , 255, 219 220 'fnof' , 402, 221 'Alpha' , 913, 222 'Beta' , 914, 223 'Gamma' , 915, 224 'Delta' , 916, 225 'Epsilon' , 917, 226 'Zeta' , 918, 227 'Eta' , 919, 228 'Theta' , 920, 229 'Iota' , 921, 230 'Kappa' , 922, 231 'Lambda' , 923, 232 'Mu' , 924, 233 'Nu' , 925, 234 'Xi' , 926, 235 'Omicron' , 927, 236 'Pi' , 928, 237 'Rho' , 929, 238 'Sigma' , 931, 239 'Tau' , 932, 240 'Upsilon' , 933, 241 'Phi' , 934, 242 'Chi' , 935, 243 'Psi' , 936, 244 'Omega' , 937, 245 'alpha' , 945, 246 'beta' , 946, 247 'gamma' , 947, 248 'delta' , 948, 249 'epsilon' , 949, 250 'zeta' , 950, 251 'eta' , 951, 252 'theta' , 952, 253 'iota' , 953, 254 'kappa' , 954, 255 'lambda' , 955, 256 'mu' , 956, 257 'nu' , 957, 258 'xi' , 958, 259 'omicron' , 959, 260 'pi' , 960, 261 'rho' , 961, 262 'sigmaf' , 962, 263 'sigma' , 963, 264 'tau' , 964, 265 'upsilon' , 965, 266 'phi' , 966, 267 'chi' , 967, 268 'psi' , 968, 269 'omega' , 969, 270 'thetasym' , 977, 271 'upsih' , 978, 272 'piv' , 982, 273 'bull' , 8226, 274 'hellip' , 8230, 275 'prime' , 8242, 276 'Prime' , 8243, 277 'oline' , 8254, 278 'frasl' , 8260, 279 'weierp' , 8472, 280 'image' , 8465, 281 'real' , 8476, 282 'trade' , 8482, 283 'alefsym' , 8501, 284 'larr' , 8592, 285 'uarr' , 8593, 286 'rarr' , 8594, 287 'darr' , 8595, 288 'harr' , 8596, 289 'crarr' , 8629, 290 'lArr' , 8656, 291 'uArr' , 8657, 292 'rArr' , 8658, 293 'dArr' , 8659, 294 'hArr' , 8660, 295 'forall' , 8704, 296 'part' , 8706, 297 'exist' , 8707, 298 'empty' , 8709, 299 'nabla' , 8711, 300 'isin' , 8712, 301 'notin' , 8713, 302 'ni' , 8715, 303 'prod' , 8719, 304 'sum' , 8721, 305 'minus' , 8722, 306 'lowast' , 8727, 307 'radic' , 8730, 308 'prop' , 8733, 309 'infin' , 8734, 310 'ang' , 8736, 311 'and' , 8743, 312 'or' , 8744, 313 'cap' , 8745, 314 'cup' , 8746, 315 'int' , 8747, 316 'there4' , 8756, 317 'sim' , 8764, 318 'cong' , 8773, 319 'asymp' , 8776, 320 'ne' , 8800, 321 'equiv' , 8801, 322 'le' , 8804, 323 'ge' , 8805, 324 'sub' , 8834, 325 'sup' , 8835, 326 'nsub' , 8836, 327 'sube' , 8838, 328 'supe' , 8839, 329 'oplus' , 8853, 330 'otimes' , 8855, 331 'perp' , 8869, 332 'sdot' , 8901, 333 'lceil' , 8968, 334 'rceil' , 8969, 335 'lfloor' , 8970, 336 'rfloor' , 8971, 337 'lang' , 9001, 338 'rang' , 9002, 339 'loz' , 9674, 340 'spades' , 9824, 341 'clubs' , 9827, 342 'hearts' , 9829, 343 'diams' , 9830, 344 'OElig' , 338, 345 'oelig' , 339, 346 'Scaron' , 352, 347 'scaron' , 353, 348 'Yuml' , 376, 349 'circ' , 710, 350 'tilde' , 732, 351 'ensp' , 8194, 352 'emsp' , 8195, 353 'thinsp' , 8201, 354 'zwnj' , 8204, 355 'zwj' , 8205, 356 'lrm' , 8206, 357 'rlm' , 8207, 358 'ndash' , 8211, 359 'mdash' , 8212, 360 'lsquo' , 8216, 361 'rsquo' , 8217, 362 'sbquo' , 8218, 363 'ldquo' , 8220, 364 'rdquo' , 8221, 365 'bdquo' , 8222, 366 'dagger' , 8224, 367 'Dagger' , 8225, 368 'permil' , 8240, 369 'lsaquo' , 8249, 370 'rsaquo' , 8250, 371 'euro' , 8364, 372 ); 373 374 375 # Fill out %Name2character... 376 { 377 %Name2character = (); 378 my($name, $number); 379 while( ($name, $number) = each %Name2character_number) { 380 if($] < 5.007 and $number > 255) { 381 $Name2character{$name} = $FAR_CHAR; 382 # substitute for Unicode characters, for perls 383 # that can't reliable handle them 384 } else { 385 $Name2character{$name} = chr $number; 386 # normal case 387 } 388 } 389 # So they resolve 'right' even in EBCDIC-land 390 $Name2character{'lt' } = '<'; 391 $Name2character{'gt' } = '>'; 392 $Name2character{'quot'} = '"'; 393 $Name2character{'amp' } = '&'; 394 $Name2character{'apos'} = "'"; 395 $Name2character{'sol' } = '/'; 396 $Name2character{'verbar'} = '|'; 397 } 398 399 #-------------------------------------------------------------------------- 400 401 %Code2USASCII = ( 402 # mostly generated by 403 # perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)" 404 32, ' ', 405 33, '!', 406 34, '"', 407 35, '#', 408 36, '$', 409 37, '%', 410 38, '&', 411 39, "'", #! 412 40, '(', 413 41, ')', 414 42, '*', 415 43, '+', 416 44, ',', 417 45, '-', 418 46, '.', 419 47, '/', 420 48, '0', 421 49, '1', 422 50, '2', 423 51, '3', 424 52, '4', 425 53, '5', 426 54, '6', 427 55, '7', 428 56, '8', 429 57, '9', 430 58, ':', 431 59, ';', 432 60, '<', 433 61, '=', 434 62, '>', 435 63, '?', 436 64, '@', 437 65, 'A', 438 66, 'B', 439 67, 'C', 440 68, 'D', 441 69, 'E', 442 70, 'F', 443 71, 'G', 444 72, 'H', 445 73, 'I', 446 74, 'J', 447 75, 'K', 448 76, 'L', 449 77, 'M', 450 78, 'N', 451 79, 'O', 452 80, 'P', 453 81, 'Q', 454 82, 'R', 455 83, 'S', 456 84, 'T', 457 85, 'U', 458 86, 'V', 459 87, 'W', 460 88, 'X', 461 89, 'Y', 462 90, 'Z', 463 91, '[', 464 92, "\\", #! 465 93, ']', 466 94, '^', 467 95, '_', 468 96, '`', 469 97, 'a', 470 98, 'b', 471 99, 'c', 472 100, 'd', 473 101, 'e', 474 102, 'f', 475 103, 'g', 476 104, 'h', 477 105, 'i', 478 106, 'j', 479 107, 'k', 480 108, 'l', 481 109, 'm', 482 110, 'n', 483 111, 'o', 484 112, 'p', 485 113, 'q', 486 114, 'r', 487 115, 's', 488 116, 't', 489 117, 'u', 490 118, 'v', 491 119, 'w', 492 120, 'x', 493 121, 'y', 494 122, 'z', 495 123, '{', 496 124, '|', 497 125, '}', 498 126, '~', 499 ); 500 501 #-------------------------------------------------------------------------- 502 503 %Latin1Code_to_fallback = (); 504 @Latin1Code_to_fallback{0xA0 .. 0xFF} = ( 505 # Copied from Text/Unidecode/x00.pm: 506 507 ' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-}, 508 'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?}, 509 'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 510 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss', 511 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 512 'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y', 513 514 ); 515 516 { 517 # Now stuff %Latin1Char_to_fallback: 518 %Latin1Char_to_fallback = (); 519 my($k,$v); 520 while( ($k,$v) = each %Latin1Code_to_fallback) { 521 $Latin1Char_to_fallback{chr $k} = $v; 522 #print chr($k), ' => ', $v, "\n"; 523 } 524 } 525 526 #-------------------------------------------------------------------------- 527 1; 528 __END__ 529 530 =head1 NAME 531 532 Pod::Escapes -- for resolving Pod EE<lt>...E<gt> sequences 533 534 =head1 SYNOPSIS 535 536 use Pod::Escapes qw(e2char); 537 ...la la la, parsing POD, la la la... 538 $text = e2char($e_node->label); 539 unless(defined $text) { 540 print "Unknown E sequence \"", $e_node->label, "\"!"; 541 } 542 ...else print/interpolate $text... 543 544 =head1 DESCRIPTION 545 546 This module provides things that are useful in decoding 547 Pod EE<lt>...E<gt> sequences. Presumably, it should be used 548 only by Pod parsers and/or formatters. 549 550 By default, Pod::Escapes exports none of its symbols. But 551 you can request any of them to be exported. 552 Either request them individually, as with 553 C<use Pod::Escapes qw(symbolname symbolname2...);>, 554 or you can do C<use Pod::Escapes qw(:ALL);> to get all 555 exportable symbols. 556 557 =head1 GOODIES 558 559 =over 560 561 =item e2char($e_content) 562 563 Given a name or number that could appear in a 564 C<EE<lt>name_or_numE<gt>> sequence, this returns the string that 565 it stands for. For example, C<e2char('sol')>, C<e2char('47')>, 566 C<e2char('0x2F')>, and C<e2char('057')> all return "/", 567 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, 568 and C<EE<lt>057E<gt>>, all mean "/". If 569 the name has no known value (as with a name of "qacute") or is 570 syntactally invalid (as with a name of "1/4"), this returns undef. 571 572 =item e2charnum($e_content) 573 574 Given a name or number that could appear in a 575 C<EE<lt>name_or_numE<gt>> sequence, this returns the number of 576 the Unicode character that this stands for. For example, 577 C<e2char('sol')>, C<e2char('47')>, 578 C<e2char('0x2F')>, and C<e2char('057')> all return 47, 579 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, 580 and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47. If 581 the name has no known value (as with a name of "qacute") or is 582 syntactally invalid (as with a name of "1/4"), this returns undef. 583 584 =item $Name2character{I<name>} 585 586 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" 587 to the string that each stands for. Note that this does not 588 include numerics (like "64" or "x981c"). Under old Perl versions 589 (before 5.7) you get a "?" in place of characters whose Unicode 590 value is over 255. 591 592 =item $Name2character_number{I<name>} 593 594 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" 595 to the Unicode value that each stands for. For example, 596 C<$Name2character_number{'eacute'}> is 201, and 597 C<$Name2character_number{'eacute'}> is 8364. You get the correct 598 Unicode value, regardless of the version of Perl you're using -- 599 which differs from C<%Name2character>'s behavior under pre-5.7 Perls. 600 601 Note that this hash does not 602 include numerics (like "64" or "x981c"). 603 604 =item $Latin1Code_to_fallback{I<integer>} 605 606 For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps 607 from the character code for a Latin-1 character (like 233 for 608 lowercase e-acute) to the US-ASCII character that best aproximates 609 it (like "e"). You may find this useful if you are rendering 610 POD in a format that you think deals well only with US-ASCII 611 characters. 612 613 =item $Latin1Char_to_fallback{I<character>} 614 615 Just as above, but maps from characters (like "\xE9", 616 lowercase e-acute) to characters (like "e"). 617 618 =item $Code2USASCII{I<integer>} 619 620 This maps from US-ASCII codes (like 32) to the corresponding 621 character (like space, for 32). Only characters 32 to 126 are 622 defined. This is meant for use by C<e2char($x)> when it senses 623 that it's running on a non-ASCII platform (where chr(32) doesn't 624 get you a space -- but $Code2USASCII{32} will). It's 625 documented here just in case you might find it useful. 626 627 =back 628 629 =head1 CAVEATS 630 631 On Perl versions before 5.7, Unicode characters with a value 632 over 255 (like lambda or emdash) can't be conveyed. This 633 module does work under such early Perl versions, but in the 634 place of each such character, you get a "?". Latin-1 635 characters (characters 160-255) are unaffected. 636 637 Under EBCDIC platforms, C<e2char($n)> may not always be the 638 same as C<chr(e2charnum($n))>, and ditto for 639 C<$Name2character{$name}> and 640 C<chr($Name2character_number{$name})>. 641 642 =head1 SEE ALSO 643 644 L<perlpod|perlpod> 645 646 L<perlpodspec|perlpodspec> 647 648 L<Text::Unidecode|Text::Unidecode> 649 650 =head1 COPYRIGHT AND DISCLAIMERS 651 652 Copyright (c) 2001-2004 Sean M. Burke. All rights reserved. 653 654 This library is free software; you can redistribute it and/or modify 655 it under the same terms as Perl itself. 656 657 This program is distributed in the hope that it will be useful, but 658 without any warranty; without even the implied warranty of 659 merchantability or fitness for a particular purpose. 660 661 Portions of the data tables in this module are derived from the 662 entity declarations in the W3C XHTML specification. 663 664 Currently (October 2001), that's these three: 665 666 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent 667 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent 668 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent 669 670 =head1 AUTHOR 671 672 Sean M. Burke C<sburke@cpan.org> 673 674 =cut 675 676 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 677 # What I used for reading the XHTML .ent files: 678 679 use strict; 680 my(@norms, @good, @bad); 681 my $dir = 'c:/sgml/docbook/'; 682 my %escapes; 683 foreach my $file (qw( 684 xhtml-symbol.ent 685 xhtml-lat1.ent 686 xhtml-special.ent 687 )) { 688 open(IN, "<$dir$file") or die "can't read-open $dir$file: $!"; 689 print "Reading $file...\n"; 690 while(<IN>) { 691 if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) { 692 my($name, $value) = ($1,$2); 693 next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt'; 694 695 $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s; 696 print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s; 697 if($value > 255) { 698 push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value; 699 push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value; 700 } else { 701 push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value; 702 } 703 } elsif(m/<!ENT/) { 704 print "# Skipping $_"; 705 } 706 707 } 708 close(IN); 709 } 710 711 print @norms; 712 print "\n ( \$] .= 5.006001 ? (\n"; 713 print @good; 714 print " ) : (\n"; 715 print @bad; 716 print " )\n);\n"; 717 718 __END__ 719 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 720 721
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue Mar 17 22:47:18 2015 | Cross-referenced by PHPXref 0.7.1 |