From 51f0f8ed69b065cd2577a27c2a2911cdda243b99 Mon Sep 17 00:00:00 2001 From: Victor Wagner Date: Fri, 24 Feb 2006 17:44:06 +0000 Subject: [PATCH] Recreated CVS repository from working copy --- .cvsignore | 15 + CODING.STD | 72 ++ COPYING | 340 +++++++ CREDITS | 35 + INSTALL | 71 ++ INSTALL.dos | 76 ++ Makefile.in | 26 + NEWS | 67 ++ README | 45 + TODO | 11 + acconfig.h | 2 + charsets/.cvsignore | 2 + charsets/8859-1.txt | 230 +++++ charsets/8859-10.txt | 303 ++++++ charsets/8859-11.txt | 297 ++++++ charsets/8859-13.txt | 299 ++++++ charsets/8859-14.txt | 301 ++++++ charsets/8859-15.txt | 303 ++++++ charsets/8859-2.txt | 230 +++++ charsets/8859-3.txt | 223 ++++ charsets/8859-4.txt | 230 +++++ charsets/8859-5.txt | 230 +++++ charsets/8859-6.txt | 185 ++++ charsets/8859-7.txt | 224 ++++ charsets/8859-8.txt | 192 ++++ charsets/8859-9.txt | 232 +++++ charsets/Makefile.in | 43 + charsets/ascii.rpl | 162 +++ charsets/ascii.spc | 3 + charsets/cp1250.txt | 274 +++++ charsets/cp1251.txt | 274 +++++ charsets/cp1252.txt | 274 +++++ charsets/cp1253.txt | 274 +++++ charsets/cp1254.txt | 274 +++++ charsets/cp1255.txt | 274 +++++ charsets/cp1256.txt | 274 +++++ charsets/cp1257.txt | 274 +++++ charsets/cp1258.txt | 274 +++++ charsets/cp437.txt | 274 +++++ charsets/cp850.txt | 274 +++++ charsets/cp852.txt | 274 +++++ charsets/cp855.txt | 275 +++++ charsets/cp857.txt | 275 +++++ charsets/cp860.txt | 275 +++++ charsets/cp861.txt | 275 +++++ charsets/cp862.txt | 275 +++++ charsets/cp863.txt | 275 +++++ charsets/cp864.txt | 275 +++++ charsets/cp865.txt | 275 +++++ charsets/cp866.txt | 275 +++++ charsets/cp869.txt | 275 +++++ charsets/cp874.txt | 274 +++++ charsets/koi8-r.txt | 257 +++++ charsets/koi8-u.txt | 224 ++++ charsets/mac-arabic.txt | 536 ++++++++++ charsets/mac-centeuro.txt | 327 ++++++ charsets/mac-cyrillic.txt | 347 +++++++ charsets/mac-greek.txt | 355 +++++++ charsets/mac-hebrew.txt | 601 +++++++++++ charsets/mac-roman.txt | 370 +++++++ charsets/tex.rpl | 94 ++ 
charsets/tex.spc | 18 + charsets/us-ascii.txt | 98 ++ compat/.cvsignore | 2 + compat/langinfo.c | 25 + compat/langinfo.h | 7 + compat/strftime.c | 70 ++ compat/strftime.h | 9 + compat/unistd.h | 9 + configure | 2022 +++++++++++++++++++++++++++++++++++++ configure.in | 128 +++ doc/.cvsignore | 8 + doc/Makefile.in | 38 + doc/catdoc.1.in | 314 ++++++ doc/catppt.1.in | 58 ++ doc/wordview.1.in | 92 ++ doc/xls2csv.1.in | 101 ++ install-sh | 250 +++++ missing | 188 ++++ mkinstalldirs | 40 + src/.cvsignore | 18 + src/Makefile.in | 103 ++ src/analyze.c | 175 ++++ src/catdoc.c | 192 ++++ src/catdoc.h | 236 +++++ src/catdoc.rsp | 3 + src/catppt.c | 157 +++ src/charsets.c | 302 ++++++ src/config.h.in | 47 + src/confutil.c | 171 ++++ src/fileutil.c | 265 +++++ src/makefile.tc | 25 + src/numutils.c | 29 + src/ole.c | 626 ++++++++++++ src/ole.h | 67 ++ src/ppt.h | 22 + src/pptparse.c | 286 ++++++ src/ppttypes.h | 63 ++ src/reader.c | 224 ++++ src/rtfread.c | 476 +++++++++ src/sheet.c | 148 +++ src/substmap.c | 170 ++++ src/wordview.tcl | 282 ++++++ src/writer.c | 87 ++ src/xls.h | 53 + src/xls2csv.c | 180 ++++ src/xlsparse.c | 777 ++++++++++++++ src/xltypes.h | 159 +++ stamp-h | 1 + 109 files changed, 22693 insertions(+) create mode 100644 .cvsignore create mode 100644 CODING.STD create mode 100644 COPYING create mode 100644 CREDITS create mode 100644 INSTALL create mode 100644 INSTALL.dos create mode 100644 Makefile.in create mode 100644 NEWS create mode 100644 README create mode 100644 TODO create mode 100644 acconfig.h create mode 100644 charsets/.cvsignore create mode 100644 charsets/8859-1.txt create mode 100644 charsets/8859-10.txt create mode 100644 charsets/8859-11.txt create mode 100644 charsets/8859-13.txt create mode 100644 charsets/8859-14.txt create mode 100644 charsets/8859-15.txt create mode 100644 charsets/8859-2.txt create mode 100644 charsets/8859-3.txt create mode 100644 charsets/8859-4.txt create mode 100644 charsets/8859-5.txt create mode 100644 
charsets/8859-6.txt create mode 100644 charsets/8859-7.txt create mode 100644 charsets/8859-8.txt create mode 100644 charsets/8859-9.txt create mode 100644 charsets/Makefile.in create mode 100644 charsets/ascii.rpl create mode 100644 charsets/ascii.spc create mode 100644 charsets/cp1250.txt create mode 100644 charsets/cp1251.txt create mode 100644 charsets/cp1252.txt create mode 100644 charsets/cp1253.txt create mode 100644 charsets/cp1254.txt create mode 100644 charsets/cp1255.txt create mode 100644 charsets/cp1256.txt create mode 100644 charsets/cp1257.txt create mode 100644 charsets/cp1258.txt create mode 100644 charsets/cp437.txt create mode 100644 charsets/cp850.txt create mode 100644 charsets/cp852.txt create mode 100644 charsets/cp855.txt create mode 100644 charsets/cp857.txt create mode 100644 charsets/cp860.txt create mode 100644 charsets/cp861.txt create mode 100644 charsets/cp862.txt create mode 100644 charsets/cp863.txt create mode 100644 charsets/cp864.txt create mode 100644 charsets/cp865.txt create mode 100644 charsets/cp866.txt create mode 100644 charsets/cp869.txt create mode 100644 charsets/cp874.txt create mode 100644 charsets/koi8-r.txt create mode 100644 charsets/koi8-u.txt create mode 100644 charsets/mac-arabic.txt create mode 100644 charsets/mac-centeuro.txt create mode 100644 charsets/mac-cyrillic.txt create mode 100644 charsets/mac-greek.txt create mode 100644 charsets/mac-hebrew.txt create mode 100644 charsets/mac-roman.txt create mode 100644 charsets/tex.rpl create mode 100644 charsets/tex.spc create mode 100644 charsets/us-ascii.txt create mode 100644 compat/.cvsignore create mode 100644 compat/langinfo.c create mode 100644 compat/langinfo.h create mode 100644 compat/strftime.c create mode 100644 compat/strftime.h create mode 100644 compat/unistd.h create mode 100755 configure create mode 100644 configure.in create mode 100644 doc/.cvsignore create mode 100644 doc/Makefile.in create mode 100644 doc/catdoc.1.in create mode 100644 
doc/catppt.1.in create mode 100644 doc/wordview.1.in create mode 100644 doc/xls2csv.1.in create mode 100755 install-sh create mode 100755 missing create mode 100755 mkinstalldirs create mode 100644 src/.cvsignore create mode 100644 src/Makefile.in create mode 100644 src/analyze.c create mode 100644 src/catdoc.c create mode 100644 src/catdoc.h create mode 100644 src/catdoc.rsp create mode 100644 src/catppt.c create mode 100644 src/charsets.c create mode 100644 src/config.h.in create mode 100644 src/confutil.c create mode 100644 src/fileutil.c create mode 100644 src/makefile.tc create mode 100644 src/numutils.c create mode 100644 src/ole.c create mode 100644 src/ole.h create mode 100644 src/ppt.h create mode 100644 src/pptparse.c create mode 100644 src/ppttypes.h create mode 100644 src/reader.c create mode 100644 src/rtfread.c create mode 100644 src/sheet.c create mode 100644 src/substmap.c create mode 100755 src/wordview.tcl create mode 100644 src/writer.c create mode 100644 src/xls.h create mode 100644 src/xls2csv.c create mode 100644 src/xlsparse.c create mode 100644 src/xltypes.h create mode 100644 stamp-h diff --git a/.cvsignore b/.cvsignore new file mode 100644 index 0000000..a7fbd2a --- /dev/null +++ b/.cvsignore @@ -0,0 +1,15 @@ +.arch-ids +.arch-ids +.arch-inventory +.arch-ids +.arch-ids +.arch-inventory +Makefile +autom4te.cache +config.cache +config.log +config.status +rtf-parser.txt +semantic.cache +tests +{arch} diff --git a/CODING.STD b/CODING.STD new file mode 100644 index 0000000..254b22a --- /dev/null +++ b/CODING.STD @@ -0,0 +1,72 @@ +CATDOC CODING STANDARD +~~~~~~~~~~~~~~~~~~~~~~ +0. CATDOC ISN'T WRITTEN ON C++!!! + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + C and C++ are different languages. + No // comments, no references, no declaration in the middle of block. + +1. Catdoc is portable program. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Please never make following assumptions: +1. 
That int is more than 16-bit wide + (consequently, that signed int can hold Unicode character) +2. That sizeof(int)>=sizeof(int *) +3. That int is always 16-bit (it can be 32 bit as well) +4. That long is 32-bit +5. That char (and int and short as well) is either signed or unsigned + Always use explicit signedness specifier +6. That integer arithmetic is 32-bit long. +7. That input is always seekable. Catdoc is often used as filter +8. That filenames are either case-sensitive or case-insensitive +9. That there is no difference between binary and text file opening mode +10. That opening file in the text mode will do something reasonable. + Always open files in binary mode. This is the only way to produce + results consistent on all platforms. +11. That you can rely on compiler POSIX or C99 compliance. If you need + to use some function defined by this standard, write configure test + and provide fallback. +12. That you can allocate chunk of memory larger than 64K. +13. That filenames can be longer than 8+3. + +2. Catdoc is used world-wide +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. Never write comments in languages other than English. +2. Never assume that you can output character without passing it through + convert_char function. + +3. Code formatting +~~~~~~~~~~~~~~~~~ +1. Use TAB for indentation. If your text editor insists on TAB being + 8 char, consider using some other editor. vim is at least a bit more + portable than catdoc. +2. Open curly bracket on the same line as statement it belongs to: + if (condition) { + code + } + rather than + if (condition) + { + code + } + +3. The only exception to rule 2 is blocks in the switch statement: + switch (var) { + case value: + { + code + } + } + rather than + switch (var) { + case value: { + code + } + } + +4. Write comments at the start of each function describing its purpose + and arguments. + +5. If you use some potentially dangerous construct, such as sprintf on + static buffer, comment why it is safe in this particular case.
+ diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/CREDITS b/CREDITS new file mode 100644 index 0000000..f573fdf --- /dev/null +++ b/CREDITS @@ -0,0 +1,35 @@ +Note: people listed in this file are listed in arbitrary order. +Kawai Takanori (Hippo2000) kwitknr@cpan.org + Author of perl module Spreadsheet::ParseExcel, which I use as + reference manual for Excel format +Alex Ott + Fixed handling of long SST, contributed handling of RK records, + wrote RTF and OLE parsers +Pawel Wiecek + Current maintainer of Debian catdoc package +Peter Novodvosky + maintained debian package for catdoc. +Bjorn Brenander + maintained debian package for catdoc. +Eugene B. Byrganov + Suggested -l switch, found me an example of partly 8-bit/partly + 16-bit file and some typos in builtin docs. Fixed some long-standing + bugs in config-parsed code. +Artem Chuprina + Provided a lot of bugfixes and suggestions. Also maintained some + unofficial packaged versions of catdoc. +Stephen Farrell + maintains FreeBSD port, and has persuaded me to write autoconf + configuration +Martin Kraemer + contributed some fixes for ascii.rpl and noted typo in catdoc.h +Arfst Ludwig + gave me the idea of creating README.charset +Dmitry Potapov + contributed rtf-parsing code +David Rysdam + Wrote program biffview, which parses XLS file and used as base + for xls2csv. +Duncan Simpson + audited catdoc code for possible buffer overruns (and found much more + of them than actually existed) diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..4b98e06 --- /dev/null +++ b/INSTALL @@ -0,0 +1,71 @@ +INSTALLING catdoc 0.91.x + +Starting with patchlevel alpha 3 catdoc version 0.90 has autoconf +configuration. Thanks to Stephen Farrell for convincing me. + +So typically you should run +./configure +make +make install + +to compile and install catdoc. + +NOTE for HPUX users.
If you want to compile catdoc with aCC, +use CC="aCC -Ae" ./configure + +Configure script for catdoc recognizes the following options (apart from +standard --prefix, --exec-prefix and so on) + +--disable-wordview - disables building of Tcl/Tk viewer wordview, + which requires X11. (note, it would be disabled automatically, + if you don't have appropriate version of Tcl/Tk). You may + wish to use this if you don't have X installed. + +--with-wish=path - specifies path to wish interpreter. This option has + two uses + 1. If executable named wish, found in your PATH is old, and + you have newer wish installed as wish4.2 or wish8.0, + you should specify this in order to build wordview viewer + 2. If you are compiling catdoc from telnet connection or + text console, you can specify this option to skip tcl + version check, which would run wish and fail if it couldn't + find X display (which would lead configure to assume, that + you don't have good wish) + +--with-input=charset +--with-output=charset + Allows you to specify charset names to expect in 8-bit word + file and to produce as output text file. Do ls ./charsets/*.txt + to find out which charsets are provided in distribution. + Additional charsets can be obtained from + ftp.unicode.org + Note that make would fail if you specify charset, which + doesn't exist in charset directory. + +--disable-charset-check + By default, make in charsets directory fails, if it is unable + to find *.txt files corresponding to default input and output + charsets. This option allows you to disable this check. Make + in charsets directory would always succeed, but it is your + responsibility to provide charset files in catdoc library + directory after make install. +--disable-langinfo + By default, catdoc tries to use your current locale charset + as its output charset. It can, of course, always be overridden + by command line switch.
But charset from the locale takes + precedence over charset in configuration file, unless + you put use_locale=no into this file. + + If your C library is not XPG4-compatible, and configure fails + to detect it, you can completely disable langinfo support + using this switch. + +If you experience strange and unexpected behavior of catdoc, try to +remove optimization flag (-O2) from FLAGS in src/Makefile. +If you can write autoconf test to check for this problem, please send it +to me. + +It was a known problem with version 0.35 on HP/UX 9, and I scarcely changed +my style of writing since. + + diff --git a/INSTALL.dos b/INSTALL.dos new file mode 100644 index 0000000..cd5519a --- /dev/null +++ b/INSTALL.dos @@ -0,0 +1,76 @@ +INSTALLING catdoc 0.90a on MS-DOS system. + +Surprise, but MS-DOS is native platform for this version of catdoc. +Unlike the previous version, which was UNIX program, ported to +DOS, this one was developed under DOS on nine-year-old 286 laptop +with Turbo C 2.0. + +So, catdoc works perfectly well on MS-DOS systems. + +Documentation can be found in files CATDOC.TXT and CATDOC.PS +(both produced by UNIX man command) + +If you've fetched BINARY DISTRIBUTION, note following: + +1. catdoc expects to find its system-wide configuration file + in the same directory as executable (and therefore requires DOS + version 3 or above) If you wish to move charset and special char + maps to location other than default (charsets subdirectory of + directory, containing executable) you must have this configuration + file. + +2. Any file name in configuration file can contain %s escape, which + would be substituted by directory of executable. + +3. All configuration files can use either DOS or UNIX end-of-line + convention. + +4. Per-user configuration probably wouldn't work. But try to define + environment variable HOME and put catdoc.rc file in directory, + pointed by it. + +5.
Catdoc uses DOS country information as specified by COUNTRY statement + in your configuration file to determine output encoding. These + settings have priority over settings in configuration files (either + per-user or system-wide). If it is not what you want, set + use_locale = no in the configuration file. + +If you are insisting on COMPILING catdoc YOURSELF, +please note that catdoc was compiled under DOS using Turbo C 2.01, +downloaded from http://community.borland.com/museum. You can get the +same one. + +I've made some attempts to compile catdoc with Watcom C (16-bit), +but haven't completely succeeded. If you do, let me know. + +1. With 16-bit compiler, use COMPACT memory model + If you are using Turbo C make -fmakefile.tc in src directory + should be enough. If you have to change anything in + the makefile.tc, please let me know. + +2. If you are using compiler other than Turbo C/Borland C or + Watcom, you should take a look at fileutil.c file and possibly + add a couple of #ifdefs there. If you succeed with it, send me a + patch (or entire modified file, if you don't know how to make + a good unix-like patch). + + +3. With 32-bit compiler you are on your own. I don't think that + small utilities like catdoc should require extender or DPMI host, + so I've never tried to build 32-bit version of catdoc for DOS. + But if you mix buffer sizes from UNIX version and file-name + dependent defines from DOS, you should probably achieve good + results. + +4. With Turbo C you'll need file getopt.c which comes with Turbo C + and unistd.h which is provided in compat directory. + Compile getopt.c and add it to cc.lib and put unistd.h in + your include directory. Later it might help you to port other + unix software. With other compiler you can also make use + of getopt.c in compat directory (which is from GNU), but I was + unable to make it work with Watcom 10.0 + +5. It is probably good idea to link wildargs.obj (or wildargv.obj) + with catdoc.
I didn't do it myself because I use korn shell on + machine where I've developed catdoc, so I don't need to include + parameter expansion in program. diff --git a/Makefile.in b/Makefile.in new file mode 100644 index 0000000..1f92217 --- /dev/null +++ b/Makefile.in @@ -0,0 +1,26 @@ + +# Your C compiler and flags +SHELL = /bin/sh + + +all: + for i in src doc charsets; do\ + (cd $$i; $(MAKE) all);\ + done + +install: + for i in src doc charsets; do\ + (cd $$i; $(MAKE) install);\ + done +clean: + for i in src doc charsets; do\ + (cd $$i; $(MAKE) clean);\ + done +distclean: + for i in src doc charsets; do\ + (cd $$i; $(MAKE) distclean);\ + done + rm Makefile config.* +dist: + $(MAKE) -C doc dosdoc + $(MAKE) distclean diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..959897b --- /dev/null +++ b/NEWS @@ -0,0 +1,67 @@ + 0.90.1 Nov 26 1998 + Top-level Makefile now uses $MAKE instead of make + fixed missing end-line escaping in wordview.tcl + All occurrences of strcpy, strcat and sprintf investigated + to avoid buffer overflows. + 0.90 Oct 29 1998 + Fixed bug with charset names redeclared locally in main() + Fixed problem in configure with wish 8.0.3 + Catdoc considered to be stable enough for release + 0.90b5 Oct 14 1998 + Fixed handling of 0x1F char (soft hyphen in Word 6.0), + now it is translated to 0x00AD (unicode soft hyphen) + Fixed permissions for manual page + Added --with-install-root configure arg to simplify + building of binary packages. + 0.90b4 September 17 1998 + Added proper configuration of library dir in wordview. + Added --disable-charset-check config option + Added 0x2026 symbol in ascii.rpl + Added more Windows codepages in distribution + 0.90b3 September 11 1998 + Added -x switch to simplify debugging of substitution maps + 0.90b2 September 10 1998 + Added some symbols in 0x2000-0x20FF range to substitution maps + These symbols occur in cp1251 so they are frequently found + in Word files.
Fixed some filename-handling problems in + wordview.tcl + + 0.90b1 September 8 1998 + Added us-ascii.charset, fixed small bugs in configure, + install is used for all installation files. Code is + considered stable enough to be beta. + + 0.90a3 September 7 1998 + Fixed small bug in table handling, which caused catdoc to + output extra column delimiter just before row delimiter. Added + autoconf configuration. install is back, although not for + charsets + + 0.90a2 August 18 1998 + version 0.90 was tested on BSDI and Solaris platform. Makefile + was rewritten to avoid use of highly incompatible + /usr/{ucb,bin}/install + + 0.90a1 August 13 1998 + Catdoc has undergone major rewrite. Now it has proper charset + handling, including UNICODE and runtime configurability. + + 0.35 - June 5 1998 + Fixed bug with -s switch which prevents catdoc from returning + non-zero code when invoked on UNIX text file + + 0.34 - Apr 28 1998 + Files now opened in binary mode thus allowing catdoc to work on + DOS and similar systems. All specs arrays now have terminating + NUL + + 0.33 - October 1997 + Fixed missing terminating NUL in specs array, which caused + random segfaults on Linux and many other systems, because + _specs_ is searched by _strchr_ function + + 0.32 - August 1997 + First major public release, uploaded to CTAN. Tk interface + appeared, manual page was written. Unfortunately, this release + was buggy. + diff --git a/README b/README new file mode 100644 index 0000000..0898682 --- /dev/null +++ b/README @@ -0,0 +1,45 @@ +CATDOC version 0.93 + +CATDOC is a program which reads MS-Word file and prints readable +ASCII text to stdout, just like Unix cat command. +It is also able to produce correct escape sequences if some UNICODE +characters have to be represented specially in your typesetting system +such as (La)TeX. + +This is completely new version of catdoc, rewritten from scratch.
+It features runtime configuration, proper charset handling, +user-definable output formats and support +for Word97 files, which contain UNICODE internally. + +Since 0.93.0 catdoc parses OLE structure and extracts WordDocument +stream, but doesn't parse internal structure of it. + +This rough approach inevitably results in some garbage in output file, +especially near the end of file and if file contains embedded OLE objects, +such as pictures or equations. + +So, if you are looking for purely automatic way to convert Word to LaTeX, +you had better investigate word2x, wvware or LAOLA. + + +Catdoc is distributed under GNU General Public License version 2 or above. + + +Your bug reports and suggestions are welcome. + +There is also major work to do - define correct TeX commands +for accented latin letters into tex.specchars file and commands +for mathematical symbols (unicode 20xx-25xx). + + +Contributions are welcome. + +See files INSTALL and INSTALL.dos for information about compiling and +installing catdoc. + +Catdoc is documented in its UNIX-style manual page. For those who don't +have man command (i.e. MS-DOS users) plain text and postscript versions +of manual are provided in doc directory. + Victor Wagner + + diff --git a/TODO b/TODO new file mode 100644 index 0000000..49a21d9 --- /dev/null +++ b/TODO @@ -0,0 +1,11 @@ +* support dual-byte (CJK) encodings as output +* Find a way to extract rowspan information from XLS. +* Make XLS2CSV output sheet partially when memory exhausted +* Plain-text output method for XLS2CSV and its support in wordview +* textmode (ck) wordview +* Improve RTF support +* Extract text from Top Level OLE objects ???
+* Write correct TeX commands for most often used mathematical symbols + (20xx-25xx) into TeX spec chars file +* Add handling of tables & footnotes +* Fastsave support diff --git a/acconfig.h b/acconfig.h new file mode 100644 index 0000000..27fec60 --- /dev/null +++ b/acconfig.h @@ -0,0 +1,2 @@ +#undef VERSION +#undef PACKAGE diff --git a/charsets/.cvsignore b/charsets/.cvsignore new file mode 100644 index 0000000..dd98ed3 --- /dev/null +++ b/charsets/.cvsignore @@ -0,0 +1,2 @@ +Makefile +semantic.cache diff --git a/charsets/8859-1.txt b/charsets/8859-1.txt new file mode 100644 index 0000000..e402254 --- /dev/null +++ b/charsets/8859-1.txt @@ -0,0 +1,230 @@ +# +# Name: ISO 8859-1 (1987) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-1 (1987) characters map into Unicode. 
+# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-1 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-1 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL 
LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x00A1 # INVERTED EXCLAMATION MARK +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A4 # CURRENCY SIGN +0xA5 0x00A5 # YEN SIGN +0xA6 0x00A6 # BROKEN BAR +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x00AA # FEMININE ORDINAL INDICATOR +0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC # NOT SIGN +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x00AE # REGISTERED SIGN +0xAF 0x00AF # MACRON +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # 
SUPERSCRIPT THREE +0xB4 0x00B4 # ACUTE ACCENT +0xB5 0x00B5 # MICRO SIGN +0xB6 0x00B6 # PILCROW SIGN +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x00B8 # CEDILLA +0xB9 0x00B9 # SUPERSCRIPT ONE +0xBA 0x00BA # MASCULINE ORDINAL INDICATOR +0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC # VULGAR FRACTION ONE QUARTER +0xBD 0x00BD # VULGAR FRACTION ONE HALF +0xBE 0x00BE # VULGAR FRACTION THREE QUARTERS +0xBF 0x00BF # INVERTED QUESTION MARK +0xC0 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 # LATIN CAPITAL LETTER AE +0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x00D0 # LATIN CAPITAL LETTER ETH (Icelandic) +0xD1 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 # LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x00DE # LATIN CAPITAL LETTER THORN 
(Icelandic) +0xDF 0x00DF # LATIN SMALL LETTER SHARP S (German) +0xE0 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 # LATIN SMALL LETTER AE +0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x00F0 # LATIN SMALL LETTER ETH (Icelandic) +0xF1 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x00FE # LATIN SMALL LETTER THORN (Icelandic) +0xFF 0x00FF # LATIN SMALL LETTER Y WITH DIAERESIS diff --git a/charsets/8859-10.txt b/charsets/8859-10.txt new file mode 100644 index 0000000..374a42b --- /dev/null +++ b/charsets/8859-10.txt @@ -0,0 +1,303 @@ +# +# Name: ISO/IEC 8859-10:1998 to Unicode +# Unicode version: 3.0 +# Table version: 1.1 +# Table format: Format A +# Date: 1999 October 11 +# Authors: Ken Whistler +# +# Copyright (c) 1999 Unicode, Inc. All Rights reserved. 
+# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on optical media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Unicode, Inc. hereby grants the right to freely use the information +# supplied in this file in the creation of products supporting the +# Unicode Standard, and to make copies of this file in any form for +# internal or external distribution as long as this notice remains +# attached. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO/IEC 8859-10:1998 characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO/IEC 8859-10 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO/IEC 8859-10 order. +# +# Version history +# 1.0 version new. +# 1.1 corrected mistake in mapping of 0xA4 +# +# Updated versions of this file may be found in: +# +# +# Any comments or problems, contact +# Please note that is an archival address; +# notices will be checked, but do not expect an immediate response. 
+# +0x00 0x0000 # NULL +0x01 0x0001 # START OF HEADING +0x02 0x0002 # START OF TEXT +0x03 0x0003 # END OF TEXT +0x04 0x0004 # END OF TRANSMISSION +0x05 0x0005 # ENQUIRY +0x06 0x0006 # ACKNOWLEDGE +0x07 0x0007 # BELL +0x08 0x0008 # BACKSPACE +0x09 0x0009 # HORIZONTAL TABULATION +0x0A 0x000A # LINE FEED +0x0B 0x000B # VERTICAL TABULATION +0x0C 0x000C # FORM FEED +0x0D 0x000D # CARRIAGE RETURN +0x0E 0x000E # SHIFT OUT +0x0F 0x000F # SHIFT IN +0x10 0x0010 # DATA LINK ESCAPE +0x11 0x0011 # DEVICE CONTROL ONE +0x12 0x0012 # DEVICE CONTROL TWO +0x13 0x0013 # DEVICE CONTROL THREE +0x14 0x0014 # DEVICE CONTROL FOUR +0x15 0x0015 # NEGATIVE ACKNOWLEDGE +0x16 0x0016 # SYNCHRONOUS IDLE +0x17 0x0017 # END OF TRANSMISSION BLOCK +0x18 0x0018 # CANCEL +0x19 0x0019 # END OF MEDIUM +0x1A 0x001A # SUBSTITUTE +0x1B 0x001B # ESCAPE +0x1C 0x001C # FILE SEPARATOR +0x1D 0x001D # GROUP SEPARATOR +0x1E 0x001E # RECORD SEPARATOR +0x1F 0x001F # UNIT SEPARATOR +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN 
CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL 
LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0x7F 0x007F # DELETE +0x80 0x0080 # +0x81 0x0081 # +0x82 0x0082 # +0x83 0x0083 # +0x84 0x0084 # +0x85 0x0085 # +0x86 0x0086 # +0x87 0x0087 # +0x88 0x0088 # +0x89 0x0089 # +0x8A 0x008A # +0x8B 0x008B # +0x8C 0x008C # +0x8D 0x008D # +0x8E 0x008E # +0x8F 0x008F # +0x90 0x0090 # +0x91 0x0091 # +0x92 0x0092 # +0x93 0x0093 # +0x94 0x0094 # +0x95 0x0095 # +0x96 0x0096 # +0x97 0x0097 # +0x98 0x0098 # +0x99 0x0099 # +0x9A 0x009A # +0x9B 0x009B # +0x9C 0x009C # +0x9D 0x009D # +0x9E 0x009E # +0x9F 0x009F # +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x0104 # LATIN CAPITAL LETTER A WITH OGONEK +0xA2 0x0112 # LATIN CAPITAL LETTER E WITH MACRON +0xA3 0x0122 # LATIN CAPITAL LETTER G WITH CEDILLA +0xA4 0x012A # LATIN CAPITAL LETTER I WITH MACRON +0xA5 0x0128 # LATIN CAPITAL LETTER I WITH TILDE +0xA6 0x0136 # LATIN CAPITAL LETTER K WITH CEDILLA +0xA7 0x00A7 # SECTION SIGN +0xA8 0x013B # LATIN CAPITAL LETTER L WITH CEDILLA +0xA9 0x0110 # LATIN CAPITAL LETTER D WITH STROKE +0xAA 0x0160 # LATIN CAPITAL LETTER S WITH CARON +0xAB 0x0166 # LATIN CAPITAL LETTER T WITH STROKE +0xAC 0x017D # LATIN CAPITAL LETTER Z WITH CARON +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x016A # LATIN CAPITAL LETTER U WITH MACRON +0xAF 0x014A # LATIN CAPITAL LETTER ENG +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x0105 # LATIN SMALL LETTER A WITH OGONEK +0xB2 0x0113 # LATIN SMALL LETTER E WITH MACRON +0xB3 0x0123 # LATIN SMALL LETTER G WITH CEDILLA +0xB4 0x012B # LATIN SMALL LETTER I WITH MACRON +0xB5 0x0129 # LATIN SMALL LETTER I WITH TILDE +0xB6 0x0137 # LATIN SMALL LETTER K WITH CEDILLA +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x013C # LATIN SMALL LETTER L WITH CEDILLA +0xB9 0x0111 # LATIN SMALL LETTER D WITH STROKE +0xBA 0x0161 # LATIN SMALL LETTER S WITH CARON +0xBB 0x0167 # LATIN SMALL LETTER T WITH STROKE +0xBC 0x017E # LATIN SMALL LETTER Z WITH CARON +0xBD 0x2015 # HORIZONTAL BAR +0xBE 0x016B # LATIN SMALL LETTER U WITH MACRON +0xBF 0x014B # LATIN SMALL LETTER ENG +0xC0 0x0100 # 
LATIN CAPITAL LETTER A WITH MACRON +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 # LATIN CAPITAL LETTER AE +0xC7 0x012E # LATIN CAPITAL LETTER I WITH OGONEK +0xC8 0x010C # LATIN CAPITAL LETTER C WITH CARON +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x0118 # LATIN CAPITAL LETTER E WITH OGONEK +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x0116 # LATIN CAPITAL LETTER E WITH DOT ABOVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x00D0 # LATIN CAPITAL LETTER ETH (Icelandic) +0xD1 0x0145 # LATIN CAPITAL LETTER N WITH CEDILLA +0xD2 0x014C # LATIN CAPITAL LETTER O WITH MACRON +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x0168 # LATIN CAPITAL LETTER U WITH TILDE +0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x0172 # LATIN CAPITAL LETTER U WITH OGONEK +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x00DE # LATIN CAPITAL LETTER THORN (Icelandic) +0xDF 0x00DF # LATIN SMALL LETTER SHARP S (German) +0xE0 0x0101 # LATIN SMALL LETTER A WITH MACRON +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 # LATIN SMALL LETTER AE +0xE7 0x012F # LATIN SMALL LETTER I WITH OGONEK +0xE8 
0x010D # LATIN SMALL LETTER C WITH CARON +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x0119 # LATIN SMALL LETTER E WITH OGONEK +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x0117 # LATIN SMALL LETTER E WITH DOT ABOVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x00F0 # LATIN SMALL LETTER ETH (Icelandic) +0xF1 0x0146 # LATIN SMALL LETTER N WITH CEDILLA +0xF2 0x014D # LATIN SMALL LETTER O WITH MACRON +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x0169 # LATIN SMALL LETTER U WITH TILDE +0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xF9 0x0173 # LATIN SMALL LETTER U WITH OGONEK +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x00FE # LATIN SMALL LETTER THORN (Icelandic) +0xFF 0x0138 # LATIN SMALL LETTER KRA diff --git a/charsets/8859-11.txt b/charsets/8859-11.txt new file mode 100644 index 0000000..192bd9d --- /dev/null +++ b/charsets/8859-11.txt @@ -0,0 +1,297 @@ +# +# Name: ISO/IEC 8859-11:2001 to Unicode +# Unicode version: 3.2 +# Table version: 1.0 +# Table format: Format A +# Date: 2002 October 7 +# Authors: Ken Whistler +# +# Copyright (c) 2002 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on optical media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. 
+# +# Unicode, Inc. hereby grants the right to freely use the information +# supplied in this file in the creation of products supporting the +# Unicode Standard, and to make copies of this file in any form for +# internal or external distribution as long as this notice remains +# attached. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO/IEC 8859-11:2001 characters map into Unicode. +# +# ISO/IEC 8859-11:2001 is equivalent to TIS 620-2533 (1990) with +# the addition of 0xA0 NO-BREAK SPACE. +# +# Format: Three tab-separated columns +# Column #1 is the ISO/IEC 8859-11 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO/IEC 8859-11 order. +# +# Version history: +# 2002 October 7 Created +# +# Updated versions of this file may be found in: +# +# +# For any comments or problems, please use the Unicode +# web contact form at: +# http://www.unicode.org/unicode/reporting.html +# +0x00 0x0000 # NULL +0x01 0x0001 # START OF HEADING +0x02 0x0002 # START OF TEXT +0x03 0x0003 # END OF TEXT +0x04 0x0004 # END OF TRANSMISSION +0x05 0x0005 # ENQUIRY +0x06 0x0006 # ACKNOWLEDGE +0x07 0x0007 # BELL +0x08 0x0008 # BACKSPACE +0x09 0x0009 # HORIZONTAL TABULATION +0x0A 0x000A # LINE FEED +0x0B 0x000B # VERTICAL TABULATION +0x0C 0x000C # FORM FEED +0x0D 0x000D # CARRIAGE RETURN +0x0E 0x000E # SHIFT OUT +0x0F 0x000F # SHIFT IN +0x10 0x0010 # DATA LINK ESCAPE +0x11 0x0011 # DEVICE CONTROL ONE +0x12 0x0012 # DEVICE CONTROL TWO +0x13 0x0013 # DEVICE CONTROL THREE +0x14 0x0014 # DEVICE CONTROL FOUR +0x15 0x0015 # NEGATIVE ACKNOWLEDGE +0x16 0x0016 # SYNCHRONOUS IDLE +0x17 0x0017 # END OF TRANSMISSION BLOCK +0x18 0x0018 # CANCEL +0x19 0x0019 # END OF MEDIUM +0x1A 0x001A # SUBSTITUTE +0x1B 0x001B # ESCAPE +0x1C 0x001C # FILE SEPARATOR +0x1D 0x001D # GROUP SEPARATOR +0x1E 0x001E # RECORD SEPARATOR +0x1F 0x001F # UNIT SEPARATOR +0x20 0x0020 # 
SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 
0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0x7F 0x007F # DELETE +0x80 0x0080 # +0x81 0x0081 # +0x82 0x0082 # +0x83 0x0083 # +0x84 0x0084 # +0x85 0x0085 # +0x86 0x0086 # +0x87 0x0087 # +0x88 0x0088 # +0x89 0x0089 # +0x8A 0x008A # +0x8B 0x008B # +0x8C 0x008C # +0x8D 0x008D # +0x8E 0x008E # +0x8F 0x008F # +0x90 0x0090 # +0x91 0x0091 # +0x92 0x0092 # +0x93 0x0093 # +0x94 0x0094 # +0x95 0x0095 # +0x96 0x0096 # +0x97 0x0097 # +0x98 0x0098 # +0x99 0x0099 # +0x9A 0x009A # +0x9B 0x009B # +0x9C 0x009C # +0x9D 0x009D # +0x9E 0x009E # +0x9F 0x009F # +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x0E01 # THAI CHARACTER KO KAI +0xA2 0x0E02 # THAI CHARACTER KHO KHAI +0xA3 0x0E03 # THAI CHARACTER KHO KHUAT +0xA4 0x0E04 # THAI CHARACTER KHO KHWAI +0xA5 0x0E05 # THAI CHARACTER KHO KHON +0xA6 0x0E06 # THAI CHARACTER KHO RAKHANG +0xA7 0x0E07 # THAI CHARACTER NGO NGU +0xA8 0x0E08 # THAI CHARACTER CHO CHAN +0xA9 0x0E09 # THAI CHARACTER CHO CHING +0xAA 0x0E0A 
# THAI CHARACTER CHO CHANG +0xAB 0x0E0B # THAI CHARACTER SO SO +0xAC 0x0E0C # THAI CHARACTER CHO CHOE +0xAD 0x0E0D # THAI CHARACTER YO YING +0xAE 0x0E0E # THAI CHARACTER DO CHADA +0xAF 0x0E0F # THAI CHARACTER TO PATAK +0xB0 0x0E10 # THAI CHARACTER THO THAN +0xB1 0x0E11 # THAI CHARACTER THO NANGMONTHO +0xB2 0x0E12 # THAI CHARACTER THO PHUTHAO +0xB3 0x0E13 # THAI CHARACTER NO NEN +0xB4 0x0E14 # THAI CHARACTER DO DEK +0xB5 0x0E15 # THAI CHARACTER TO TAO +0xB6 0x0E16 # THAI CHARACTER THO THUNG +0xB7 0x0E17 # THAI CHARACTER THO THAHAN +0xB8 0x0E18 # THAI CHARACTER THO THONG +0xB9 0x0E19 # THAI CHARACTER NO NU +0xBA 0x0E1A # THAI CHARACTER BO BAIMAI +0xBB 0x0E1B # THAI CHARACTER PO PLA +0xBC 0x0E1C # THAI CHARACTER PHO PHUNG +0xBD 0x0E1D # THAI CHARACTER FO FA +0xBE 0x0E1E # THAI CHARACTER PHO PHAN +0xBF 0x0E1F # THAI CHARACTER FO FAN +0xC0 0x0E20 # THAI CHARACTER PHO SAMPHAO +0xC1 0x0E21 # THAI CHARACTER MO MA +0xC2 0x0E22 # THAI CHARACTER YO YAK +0xC3 0x0E23 # THAI CHARACTER RO RUA +0xC4 0x0E24 # THAI CHARACTER RU +0xC5 0x0E25 # THAI CHARACTER LO LING +0xC6 0x0E26 # THAI CHARACTER LU +0xC7 0x0E27 # THAI CHARACTER WO WAEN +0xC8 0x0E28 # THAI CHARACTER SO SALA +0xC9 0x0E29 # THAI CHARACTER SO RUSI +0xCA 0x0E2A # THAI CHARACTER SO SUA +0xCB 0x0E2B # THAI CHARACTER HO HIP +0xCC 0x0E2C # THAI CHARACTER LO CHULA +0xCD 0x0E2D # THAI CHARACTER O ANG +0xCE 0x0E2E # THAI CHARACTER HO NOKHUK +0xCF 0x0E2F # THAI CHARACTER PAIYANNOI +0xD0 0x0E30 # THAI CHARACTER SARA A +0xD1 0x0E31 # THAI CHARACTER MAI HAN-AKAT +0xD2 0x0E32 # THAI CHARACTER SARA AA +0xD3 0x0E33 # THAI CHARACTER SARA AM +0xD4 0x0E34 # THAI CHARACTER SARA I +0xD5 0x0E35 # THAI CHARACTER SARA II +0xD6 0x0E36 # THAI CHARACTER SARA UE +0xD7 0x0E37 # THAI CHARACTER SARA UEE +0xD8 0x0E38 # THAI CHARACTER SARA U +0xD9 0x0E39 # THAI CHARACTER SARA UU +0xDA 0x0E3A # THAI CHARACTER PHINTHU +0xDF 0x0E3F # THAI CURRENCY SYMBOL BAHT +0xE0 0x0E40 # THAI CHARACTER SARA E +0xE1 0x0E41 # THAI CHARACTER SARA AE +0xE2 0x0E42 # THAI 
CHARACTER SARA O +0xE3 0x0E43 # THAI CHARACTER SARA AI MAIMUAN +0xE4 0x0E44 # THAI CHARACTER SARA AI MAIMALAI +0xE5 0x0E45 # THAI CHARACTER LAKKHANGYAO +0xE6 0x0E46 # THAI CHARACTER MAIYAMOK +0xE7 0x0E47 # THAI CHARACTER MAITAIKHU +0xE8 0x0E48 # THAI CHARACTER MAI EK +0xE9 0x0E49 # THAI CHARACTER MAI THO +0xEA 0x0E4A # THAI CHARACTER MAI TRI +0xEB 0x0E4B # THAI CHARACTER MAI CHATTAWA +0xEC 0x0E4C # THAI CHARACTER THANTHAKHAT +0xED 0x0E4D # THAI CHARACTER NIKHAHIT +0xEE 0x0E4E # THAI CHARACTER YAMAKKAN +0xEF 0x0E4F # THAI CHARACTER FONGMAN +0xF0 0x0E50 # THAI DIGIT ZERO +0xF1 0x0E51 # THAI DIGIT ONE +0xF2 0x0E52 # THAI DIGIT TWO +0xF3 0x0E53 # THAI DIGIT THREE +0xF4 0x0E54 # THAI DIGIT FOUR +0xF5 0x0E55 # THAI DIGIT FIVE +0xF6 0x0E56 # THAI DIGIT SIX +0xF7 0x0E57 # THAI DIGIT SEVEN +0xF8 0x0E58 # THAI DIGIT EIGHT +0xF9 0x0E59 # THAI DIGIT NINE +0xFA 0x0E5A # THAI CHARACTER ANGKHANKHU +0xFB 0x0E5B # THAI CHARACTER KHOMUT diff --git a/charsets/8859-13.txt b/charsets/8859-13.txt new file mode 100644 index 0000000..cd11b53 --- /dev/null +++ b/charsets/8859-13.txt @@ -0,0 +1,299 @@ +# +# Name: ISO/IEC 8859-13:1998 to Unicode +# Unicode version: 3.0 +# Table version: 1.0 +# Table format: Format A +# Date: 1999 July 27 +# Authors: Ken Whistler +# +# Copyright (c) 1998 - 1999 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on optical media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Unicode, Inc. 
hereby grants the right to freely use the information +# supplied in this file in the creation of products supporting the +# Unicode Standard, and to make copies of this file in any form for +# internal or external distribution as long as this notice remains +# attached. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO/IEC 8859-13:1998 characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO/IEC 8859-13 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO/IEC 8859-13 order. +# +# Updated versions of this file may be found in: +# +# +# Any comments or problems, contact +# Please note that is an archival address; +# notices will be checked, but do not expect an immediate response. +# +0x00 0x0000 # NULL +0x01 0x0001 # START OF HEADING +0x02 0x0002 # START OF TEXT +0x03 0x0003 # END OF TEXT +0x04 0x0004 # END OF TRANSMISSION +0x05 0x0005 # ENQUIRY +0x06 0x0006 # ACKNOWLEDGE +0x07 0x0007 # BELL +0x08 0x0008 # BACKSPACE +0x09 0x0009 # HORIZONTAL TABULATION +0x0A 0x000A # LINE FEED +0x0B 0x000B # VERTICAL TABULATION +0x0C 0x000C # FORM FEED +0x0D 0x000D # CARRIAGE RETURN +0x0E 0x000E # SHIFT OUT +0x0F 0x000F # SHIFT IN +0x10 0x0010 # DATA LINK ESCAPE +0x11 0x0011 # DEVICE CONTROL ONE +0x12 0x0012 # DEVICE CONTROL TWO +0x13 0x0013 # DEVICE CONTROL THREE +0x14 0x0014 # DEVICE CONTROL FOUR +0x15 0x0015 # NEGATIVE ACKNOWLEDGE +0x16 0x0016 # SYNCHRONOUS IDLE +0x17 0x0017 # END OF TRANSMISSION BLOCK +0x18 0x0018 # CANCEL +0x19 0x0019 # END OF MEDIUM +0x1A 0x001A # SUBSTITUTE +0x1B 0x001B # ESCAPE +0x1C 0x001C # FILE SEPARATOR +0x1D 0x001D # GROUP SEPARATOR +0x1E 0x001E # RECORD SEPARATOR +0x1F 0x001F # UNIT SEPARATOR +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 
0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 
0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0x7F 0x007F # DELETE +0x80 0x0080 # +0x81 0x0081 # +0x82 0x0082 # +0x83 0x0083 # +0x84 0x0084 # +0x85 0x0085 # +0x86 0x0086 # +0x87 0x0087 # +0x88 0x0088 # +0x89 0x0089 # +0x8A 0x008A # +0x8B 0x008B # +0x8C 0x008C # +0x8D 0x008D # +0x8E 0x008E # +0x8F 0x008F # +0x90 0x0090 # +0x91 0x0091 # +0x92 0x0092 # +0x93 0x0093 # +0x94 0x0094 # +0x95 0x0095 # +0x96 0x0096 # +0x97 0x0097 # +0x98 0x0098 # +0x99 0x0099 # +0x9A 0x009A # +0x9B 0x009B # +0x9C 0x009C # +0x9D 0x009D # +0x9E 0x009E # +0x9F 0x009F # +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x201D # RIGHT DOUBLE QUOTATION MARK +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A4 # CURRENCY SIGN +0xA5 0x201E # DOUBLE LOW-9 QUOTATION MARK +0xA6 0x00A6 # BROKEN BAR +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x0156 # LATIN CAPITAL LETTER R WITH CEDILLA +0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC # NOT SIGN +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x00AE # REGISTERED SIGN +0xAF 0x00C6 # LATIN CAPITAL LETTER 
AE +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # SUPERSCRIPT THREE +0xB4 0x201C # LEFT DOUBLE QUOTATION MARK +0xB5 0x00B5 # MICRO SIGN +0xB6 0x00B6 # PILCROW SIGN +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xB9 0x00B9 # SUPERSCRIPT ONE +0xBA 0x0157 # LATIN SMALL LETTER R WITH CEDILLA +0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC # VULGAR FRACTION ONE QUARTER +0xBD 0x00BD # VULGAR FRACTION ONE HALF +0xBE 0x00BE # VULGAR FRACTION THREE QUARTERS +0xBF 0x00E6 # LATIN SMALL LETTER AE +0xC0 0x0104 # LATIN CAPITAL LETTER A WITH OGONEK +0xC1 0x012E # LATIN CAPITAL LETTER I WITH OGONEK +0xC2 0x0100 # LATIN CAPITAL LETTER A WITH MACRON +0xC3 0x0106 # LATIN CAPITAL LETTER C WITH ACUTE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x0118 # LATIN CAPITAL LETTER E WITH OGONEK +0xC7 0x0112 # LATIN CAPITAL LETTER E WITH MACRON +0xC8 0x010C # LATIN CAPITAL LETTER C WITH CARON +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x0179 # LATIN CAPITAL LETTER Z WITH ACUTE +0xCB 0x0116 # LATIN CAPITAL LETTER E WITH DOT ABOVE +0xCC 0x0122 # LATIN CAPITAL LETTER G WITH CEDILLA +0xCD 0x0136 # LATIN CAPITAL LETTER K WITH CEDILLA +0xCE 0x012A # LATIN CAPITAL LETTER I WITH MACRON +0xCF 0x013B # LATIN CAPITAL LETTER L WITH CEDILLA +0xD0 0x0160 # LATIN CAPITAL LETTER S WITH CARON +0xD1 0x0143 # LATIN CAPITAL LETTER N WITH ACUTE +0xD2 0x0145 # LATIN CAPITAL LETTER N WITH CEDILLA +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x014C # LATIN CAPITAL LETTER O WITH MACRON +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x0172 # LATIN CAPITAL LETTER U WITH OGONEK +0xD9 0x0141 # LATIN CAPITAL LETTER L WITH STROKE +0xDA 0x015A # LATIN CAPITAL LETTER S WITH ACUTE +0xDB 0x016A # LATIN CAPITAL LETTER U WITH MACRON +0xDC 
0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x017B # LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xDE 0x017D # LATIN CAPITAL LETTER Z WITH CARON +0xDF 0x00DF # LATIN SMALL LETTER SHARP S (German) +0xE0 0x0105 # LATIN SMALL LETTER A WITH OGONEK +0xE1 0x012F # LATIN SMALL LETTER I WITH OGONEK +0xE2 0x0101 # LATIN SMALL LETTER A WITH MACRON +0xE3 0x0107 # LATIN SMALL LETTER C WITH ACUTE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x0119 # LATIN SMALL LETTER E WITH OGONEK +0xE7 0x0113 # LATIN SMALL LETTER E WITH MACRON +0xE8 0x010D # LATIN SMALL LETTER C WITH CARON +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x017A # LATIN SMALL LETTER Z WITH ACUTE +0xEB 0x0117 # LATIN SMALL LETTER E WITH DOT ABOVE +0xEC 0x0123 # LATIN SMALL LETTER G WITH CEDILLA +0xED 0x0137 # LATIN SMALL LETTER K WITH CEDILLA +0xEE 0x012B # LATIN SMALL LETTER I WITH MACRON +0xEF 0x013C # LATIN SMALL LETTER L WITH CEDILLA +0xF0 0x0161 # LATIN SMALL LETTER S WITH CARON +0xF1 0x0144 # LATIN SMALL LETTER N WITH ACUTE +0xF2 0x0146 # LATIN SMALL LETTER N WITH CEDILLA +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x014D # LATIN SMALL LETTER O WITH MACRON +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x0173 # LATIN SMALL LETTER U WITH OGONEK +0xF9 0x0142 # LATIN SMALL LETTER L WITH STROKE +0xFA 0x015B # LATIN SMALL LETTER S WITH ACUTE +0xFB 0x016B # LATIN SMALL LETTER U WITH MACRON +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x017C # LATIN SMALL LETTER Z WITH DOT ABOVE +0xFE 0x017E # LATIN SMALL LETTER Z WITH CARON +0xFF 0x2019 # RIGHT SINGLE QUOTATION MARK diff --git a/charsets/8859-14.txt b/charsets/8859-14.txt new file mode 100644 index 0000000..36038f4 --- /dev/null +++ b/charsets/8859-14.txt @@ -0,0 +1,301 @@ +# +# Name: ISO/IEC 8859-14:1998 to Unicode +# Unicode version: 3.0 +# Table version: 1.0 +# Table format: Format 
A +# Date: 1999 July 27 +# Authors: Markus Kuhn +# Ken Whistler +# +# Copyright (c) 1998 - 1999 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on optical media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Unicode, Inc. hereby grants the right to freely use the information +# supplied in this file in the creation of products supporting the +# Unicode Standard, and to make copies of this file in any form for +# internal or external distribution as long as this notice remains +# attached. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO/IEC 8859-14:1998 characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO/IEC 8859-14 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO/IEC 8859-14 order. +# +# Updated versions of this file may be found in: +# +# +# Any comments or problems, contact +# Please note that is an archival address; +# notices will be checked, but do not expect an immediate response. 
+# +0x00 0x0000 # NULL +0x01 0x0001 # START OF HEADING +0x02 0x0002 # START OF TEXT +0x03 0x0003 # END OF TEXT +0x04 0x0004 # END OF TRANSMISSION +0x05 0x0005 # ENQUIRY +0x06 0x0006 # ACKNOWLEDGE +0x07 0x0007 # BELL +0x08 0x0008 # BACKSPACE +0x09 0x0009 # HORIZONTAL TABULATION +0x0A 0x000A # LINE FEED +0x0B 0x000B # VERTICAL TABULATION +0x0C 0x000C # FORM FEED +0x0D 0x000D # CARRIAGE RETURN +0x0E 0x000E # SHIFT OUT +0x0F 0x000F # SHIFT IN +0x10 0x0010 # DATA LINK ESCAPE +0x11 0x0011 # DEVICE CONTROL ONE +0x12 0x0012 # DEVICE CONTROL TWO +0x13 0x0013 # DEVICE CONTROL THREE +0x14 0x0014 # DEVICE CONTROL FOUR +0x15 0x0015 # NEGATIVE ACKNOWLEDGE +0x16 0x0016 # SYNCHRONOUS IDLE +0x17 0x0017 # END OF TRANSMISSION BLOCK +0x18 0x0018 # CANCEL +0x19 0x0019 # END OF MEDIUM +0x1A 0x001A # SUBSTITUTE +0x1B 0x001B # ESCAPE +0x1C 0x001C # FILE SEPARATOR +0x1D 0x001D # GROUP SEPARATOR +0x1E 0x001E # RECORD SEPARATOR +0x1F 0x001F # UNIT SEPARATOR +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN 
CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL 
LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0x7F 0x007F # DELETE +0x80 0x0080 # +0x81 0x0081 # +0x82 0x0082 # +0x83 0x0083 # +0x84 0x0084 # +0x85 0x0085 # +0x86 0x0086 # +0x87 0x0087 # +0x88 0x0088 # +0x89 0x0089 # +0x8A 0x008A # +0x8B 0x008B # +0x8C 0x008C # +0x8D 0x008D # +0x8E 0x008E # +0x8F 0x008F # +0x90 0x0090 # +0x91 0x0091 # +0x92 0x0092 # +0x93 0x0093 # +0x94 0x0094 # +0x95 0x0095 # +0x96 0x0096 # +0x97 0x0097 # +0x98 0x0098 # +0x99 0x0099 # +0x9A 0x009A # +0x9B 0x009B # +0x9C 0x009C # +0x9D 0x009D # +0x9E 0x009E # +0x9F 0x009F # +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x1E02 # LATIN CAPITAL LETTER B WITH DOT ABOVE +0xA2 0x1E03 # LATIN SMALL LETTER B WITH DOT ABOVE +0xA3 0x00A3 # POUND SIGN +0xA4 0x010A # LATIN CAPITAL LETTER C WITH DOT ABOVE +0xA5 0x010B # LATIN SMALL LETTER C WITH DOT ABOVE +0xA6 0x1E0A # LATIN CAPITAL LETTER D WITH DOT ABOVE +0xA7 0x00A7 # SECTION SIGN +0xA8 0x1E80 # LATIN CAPITAL LETTER W WITH GRAVE +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x1E82 # LATIN CAPITAL LETTER W WITH ACUTE +0xAB 0x1E0B # LATIN SMALL LETTER D WITH DOT ABOVE +0xAC 0x1EF2 # LATIN CAPITAL LETTER Y WITH GRAVE +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x00AE # REGISTERED SIGN +0xAF 0x0178 # LATIN CAPITAL LETTER Y WITH DIAERESIS +0xB0 0x1E1E # LATIN CAPITAL LETTER F WITH DOT ABOVE +0xB1 0x1E1F # LATIN SMALL LETTER F WITH DOT ABOVE +0xB2 0x0120 # LATIN CAPITAL LETTER G WITH DOT ABOVE +0xB3 0x0121 # LATIN SMALL LETTER G WITH DOT ABOVE +0xB4 0x1E40 # LATIN CAPITAL LETTER M WITH DOT ABOVE +0xB5 0x1E41 # LATIN SMALL LETTER M WITH DOT ABOVE +0xB6 0x00B6 # PILCROW SIGN +0xB7 0x1E56 # LATIN CAPITAL LETTER P WITH DOT ABOVE +0xB8 0x1E81 # LATIN SMALL LETTER W WITH GRAVE +0xB9 0x1E57 # LATIN SMALL LETTER P WITH DOT ABOVE +0xBA 0x1E83 # LATIN SMALL LETTER W WITH ACUTE +0xBB 0x1E60 # LATIN CAPITAL LETTER S WITH DOT ABOVE +0xBC 0x1EF3 # LATIN SMALL LETTER Y WITH GRAVE +0xBD 0x1E84 # LATIN CAPITAL LETTER W WITH DIAERESIS +0xBE 0x1E85 # LATIN SMALL LETTER W WITH DIAERESIS +0xBF 
0x1E61 # LATIN SMALL LETTER S WITH DOT ABOVE +0xC0 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 # LATIN CAPITAL LETTER AE +0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x0174 # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0xD1 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x1E6A # LATIN CAPITAL LETTER T WITH DOT ABOVE +0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 # LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x0176 # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0xDF 0x00DF # LATIN SMALL LETTER SHARP S +0xE0 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 # LATIN SMALL LETTER AE 
+0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x0175 # LATIN SMALL LETTER W WITH CIRCUMFLEX +0xF1 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x1E6B # LATIN SMALL LETTER T WITH DOT ABOVE +0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x0177 # LATIN SMALL LETTER Y WITH CIRCUMFLEX +0xFF 0x00FF # LATIN SMALL LETTER Y WITH DIAERESIS + diff --git a/charsets/8859-15.txt b/charsets/8859-15.txt new file mode 100644 index 0000000..1e31970 --- /dev/null +++ b/charsets/8859-15.txt @@ -0,0 +1,303 @@ +# +# Name: ISO/IEC 8859-15:1999 to Unicode +# Unicode version: 3.0 +# Table version: 1.0 +# Table format: Format A +# Date: 1999 July 27 +# Authors: Markus Kuhn +# Ken Whistler +# +# Copyright (c) 1998 - 1999 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. 
If this +# file has been provided on optical media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Unicode, Inc. hereby grants the right to freely use the information +# supplied in this file in the creation of products supporting the +# Unicode Standard, and to make copies of this file in any form for +# internal or external distribution as long as this notice remains +# attached. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO/IEC 8859-15:1999 characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO/IEC 8859-15 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO/IEC 8859-15 order. +# +# Version history +# +# Updated versions of this file may be found in: +# +# +# Any comments or problems, contact +# Please note that is an archival address; +# notices will be checked, but do not expect an immediate response. 
+# +0x00 0x0000 # NULL +0x01 0x0001 # START OF HEADING +0x02 0x0002 # START OF TEXT +0x03 0x0003 # END OF TEXT +0x04 0x0004 # END OF TRANSMISSION +0x05 0x0005 # ENQUIRY +0x06 0x0006 # ACKNOWLEDGE +0x07 0x0007 # BELL +0x08 0x0008 # BACKSPACE +0x09 0x0009 # HORIZONTAL TABULATION +0x0A 0x000A # LINE FEED +0x0B 0x000B # VERTICAL TABULATION +0x0C 0x000C # FORM FEED +0x0D 0x000D # CARRIAGE RETURN +0x0E 0x000E # SHIFT OUT +0x0F 0x000F # SHIFT IN +0x10 0x0010 # DATA LINK ESCAPE +0x11 0x0011 # DEVICE CONTROL ONE +0x12 0x0012 # DEVICE CONTROL TWO +0x13 0x0013 # DEVICE CONTROL THREE +0x14 0x0014 # DEVICE CONTROL FOUR +0x15 0x0015 # NEGATIVE ACKNOWLEDGE +0x16 0x0016 # SYNCHRONOUS IDLE +0x17 0x0017 # END OF TRANSMISSION BLOCK +0x18 0x0018 # CANCEL +0x19 0x0019 # END OF MEDIUM +0x1A 0x001A # SUBSTITUTE +0x1B 0x001B # ESCAPE +0x1C 0x001C # FILE SEPARATOR +0x1D 0x001D # GROUP SEPARATOR +0x1E 0x001E # RECORD SEPARATOR +0x1F 0x001F # UNIT SEPARATOR +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN 
CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL 
LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0x7F 0x007F # DELETE +0x80 0x0080 # +0x81 0x0081 # +0x82 0x0082 # +0x83 0x0083 # +0x84 0x0084 # +0x85 0x0085 # +0x86 0x0086 # +0x87 0x0087 # +0x88 0x0088 # +0x89 0x0089 # +0x8A 0x008A # +0x8B 0x008B # +0x8C 0x008C # +0x8D 0x008D # +0x8E 0x008E # +0x8F 0x008F # +0x90 0x0090 # +0x91 0x0091 # +0x92 0x0092 # +0x93 0x0093 # +0x94 0x0094 # +0x95 0x0095 # +0x96 0x0096 # +0x97 0x0097 # +0x98 0x0098 # +0x99 0x0099 # +0x9A 0x009A # +0x9B 0x009B # +0x9C 0x009C # +0x9D 0x009D # +0x9E 0x009E # +0x9F 0x009F # +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x00A1 # INVERTED EXCLAMATION MARK +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x20AC # EURO SIGN +0xA5 0x00A5 # YEN SIGN +0xA6 0x0160 # LATIN CAPITAL LETTER S WITH CARON +0xA7 0x00A7 # SECTION SIGN +0xA8 0x0161 # LATIN SMALL LETTER S WITH CARON +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x00AA # FEMININE ORDINAL INDICATOR +0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC # NOT SIGN +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x00AE # REGISTERED SIGN +0xAF 0x00AF # MACRON +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # SUPERSCRIPT THREE +0xB4 0x017D # LATIN CAPITAL LETTER Z WITH CARON +0xB5 0x00B5 # MICRO SIGN +0xB6 0x00B6 # PILCROW SIGN +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x017E # LATIN SMALL LETTER Z WITH CARON +0xB9 0x00B9 # SUPERSCRIPT ONE +0xBA 0x00BA # MASCULINE ORDINAL INDICATOR +0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x0152 # LATIN CAPITAL LIGATURE OE +0xBD 0x0153 # LATIN SMALL LIGATURE OE +0xBE 0x0178 # LATIN CAPITAL LETTER Y WITH DIAERESIS +0xBF 0x00BF # INVERTED QUESTION MARK +0xC0 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE 
+0xC6 0x00C6 # LATIN CAPITAL LETTER AE +0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x00D0 # LATIN CAPITAL LETTER ETH +0xD1 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 # LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x00DE # LATIN CAPITAL LETTER THORN +0xDF 0x00DF # LATIN SMALL LETTER SHARP S +0xE0 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 # LATIN SMALL LETTER AE +0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 
0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x00F0 # LATIN SMALL LETTER ETH +0xF1 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x00FE # LATIN SMALL LETTER THORN +0xFF 0x00FF # LATIN SMALL LETTER Y WITH DIAERESIS + diff --git a/charsets/8859-2.txt b/charsets/8859-2.txt new file mode 100644 index 0000000..0614739 --- /dev/null +++ b/charsets/8859-2.txt @@ -0,0 +1,230 @@ +# +# Name: ISO 8859-2 (1987) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. 
+# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-2 (1987) characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-2 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-2 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 
0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x0104 # LATIN CAPITAL LETTER A WITH OGONEK +0xA2 0x02D8 # BREVE +0xA3 0x0141 # LATIN CAPITAL LETTER L WITH STROKE +0xA4 0x00A4 # CURRENCY SIGN +0xA5 0x013D # LATIN CAPITAL LETTER L WITH CARON +0xA6 0x015A # LATIN CAPITAL LETTER S WITH ACUTE +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x0160 # LATIN CAPITAL LETTER S WITH CARON +0xAA 0x015E # LATIN CAPITAL LETTER S WITH CEDILLA +0xAB 0x0164 # LATIN CAPITAL 
LETTER T WITH CARON +0xAC 0x0179 # LATIN CAPITAL LETTER Z WITH ACUTE +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x017D # LATIN CAPITAL LETTER Z WITH CARON +0xAF 0x017B # LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x0105 # LATIN SMALL LETTER A WITH OGONEK +0xB2 0x02DB # OGONEK +0xB3 0x0142 # LATIN SMALL LETTER L WITH STROKE +0xB4 0x00B4 # ACUTE ACCENT +0xB5 0x013E # LATIN SMALL LETTER L WITH CARON +0xB6 0x015B # LATIN SMALL LETTER S WITH ACUTE +0xB7 0x02C7 # CARON +0xB8 0x00B8 # CEDILLA +0xB9 0x0161 # LATIN SMALL LETTER S WITH CARON +0xBA 0x015F # LATIN SMALL LETTER S WITH CEDILLA +0xBB 0x0165 # LATIN SMALL LETTER T WITH CARON +0xBC 0x017A # LATIN SMALL LETTER Z WITH ACUTE +0xBD 0x02DD # DOUBLE ACUTE ACCENT +0xBE 0x017E # LATIN SMALL LETTER Z WITH CARON +0xBF 0x017C # LATIN SMALL LETTER Z WITH DOT ABOVE +0xC0 0x0154 # LATIN CAPITAL LETTER R WITH ACUTE +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x0102 # LATIN CAPITAL LETTER A WITH BREVE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x0139 # LATIN CAPITAL LETTER L WITH ACUTE +0xC6 0x0106 # LATIN CAPITAL LETTER C WITH ACUTE +0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x010C # LATIN CAPITAL LETTER C WITH CARON +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x0118 # LATIN CAPITAL LETTER E WITH OGONEK +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x011A # LATIN CAPITAL LETTER E WITH CARON +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x010E # LATIN CAPITAL LETTER D WITH CARON +0xD0 0x0110 # LATIN CAPITAL LETTER D WITH STROKE +0xD1 0x0143 # LATIN CAPITAL LETTER N WITH ACUTE +0xD2 0x0147 # LATIN CAPITAL LETTER N WITH CARON +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x0150 # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS 
+0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x0158 # LATIN CAPITAL LETTER R WITH CARON +0xD9 0x016E # LATIN CAPITAL LETTER U WITH RING ABOVE +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x0170 # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x0162 # LATIN CAPITAL LETTER T WITH CEDILLA +0xDF 0x00DF # LATIN SMALL LETTER SHARP S +0xE0 0x0155 # LATIN SMALL LETTER R WITH ACUTE +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x0103 # LATIN SMALL LETTER A WITH BREVE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x013A # LATIN SMALL LETTER L WITH ACUTE +0xE6 0x0107 # LATIN SMALL LETTER C WITH ACUTE +0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x010D # LATIN SMALL LETTER C WITH CARON +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x0119 # LATIN SMALL LETTER E WITH OGONEK +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x011B # LATIN SMALL LETTER E WITH CARON +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x010F # LATIN SMALL LETTER D WITH CARON +0xF0 0x0111 # LATIN SMALL LETTER D WITH STROKE +0xF1 0x0144 # LATIN SMALL LETTER N WITH ACUTE +0xF2 0x0148 # LATIN SMALL LETTER N WITH CARON +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x0151 # LATIN SMALL LETTER O WITH DOUBLE ACUTE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x0159 # LATIN SMALL LETTER R WITH CARON +0xF9 0x016F # LATIN SMALL LETTER U WITH RING ABOVE +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x0171 # LATIN SMALL LETTER U WITH DOUBLE ACUTE +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x0163 # LATIN SMALL LETTER T WITH CEDILLA +0xFF 0x02D9 # DOT ABOVE diff --git a/charsets/8859-3.txt 
b/charsets/8859-3.txt new file mode 100644 index 0000000..f9eeb86 --- /dev/null +++ b/charsets/8859-3.txt @@ -0,0 +1,223 @@ +# +# Name: ISO 8859-3 (1988) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-3 (1988) characters map into Unicode. 
+# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-3 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-3 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL 
LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x0126 # LATIN CAPITAL LETTER H WITH STROKE +0xA2 0x02D8 # BREVE +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A4 # CURRENCY SIGN +0xA6 0x0124 # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x0130 # LATIN CAPITAL LETTER I WITH DOT ABOVE +0xAA 0x015E # LATIN CAPITAL LETTER S WITH CEDILLA +0xAB 0x011E # LATIN CAPITAL LETTER G WITH BREVE +0xAC 0x0134 # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0xAD 0x00AD # SOFT HYPHEN +0xAF 0x017B # LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x0127 # 
LATIN SMALL LETTER H WITH STROKE +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # SUPERSCRIPT THREE +0xB4 0x00B4 # ACUTE ACCENT +0xB5 0x00B5 # MICRO SIGN +0xB6 0x0125 # LATIN SMALL LETTER H WITH CIRCUMFLEX +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x00B8 # CEDILLA +0xB9 0x0131 # LATIN SMALL LETTER DOTLESS I +0xBA 0x015F # LATIN SMALL LETTER S WITH CEDILLA +0xBB 0x011F # LATIN SMALL LETTER G WITH BREVE +0xBC 0x0135 # LATIN SMALL LETTER J WITH CIRCUMFLEX +0xBD 0x00BD # VULGAR FRACTION ONE HALF +0xBF 0x017C # LATIN SMALL LETTER Z WITH DOT ABOVE +0xC0 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x010A # LATIN CAPITAL LETTER C WITH DOT ABOVE +0xC6 0x0108 # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xD1 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x0120 # LATIN CAPITAL LETTER G WITH DOT ABOVE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x011C # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +0xD9 0x00D9 # LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x016C # LATIN CAPITAL LETTER U WITH BREVE +0xDE 0x015C # LATIN CAPITAL LETTER S 
WITH CIRCUMFLEX +0xDF 0x00DF # LATIN SMALL LETTER SHARP S +0xE0 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x010B # LATIN SMALL LETTER C WITH DOT ABOVE +0xE6 0x0109 # LATIN SMALL LETTER C WITH CIRCUMFLEX +0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0xF1 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x0121 # LATIN SMALL LETTER G WITH DOT ABOVE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x011D # LATIN SMALL LETTER G WITH CIRCUMFLEX +0xF9 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x016D # LATIN SMALL LETTER U WITH BREVE +0xFE 0x015D # LATIN SMALL LETTER S WITH CIRCUMFLEX +0xFF 0x02D9 # DOT ABOVE diff --git a/charsets/8859-4.txt b/charsets/8859-4.txt new file mode 100644 index 0000000..8b72ad8 --- /dev/null +++ b/charsets/8859-4.txt @@ -0,0 +1,230 @@ +# +# Name: ISO 8859-4 (1988) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). 
+# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-4 (1988) characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-4 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-4 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # 
COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # 
LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x0104 # LATIN CAPITAL LETTER A WITH OGONEK +0xA2 0x0138 # LATIN SMALL LETTER KRA +0xA3 0x0156 # LATIN CAPITAL LETTER R WITH CEDILLA +0xA4 0x00A4 # CURRENCY SIGN +0xA5 0x0128 # LATIN CAPITAL LETTER I WITH TILDE +0xA6 0x013B # LATIN CAPITAL LETTER L WITH CEDILLA +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x0160 # LATIN CAPITAL LETTER S WITH CARON +0xAA 0x0112 # LATIN CAPITAL LETTER E WITH MACRON +0xAB 0x0122 # LATIN CAPITAL LETTER G WITH CEDILLA +0xAC 0x0166 # LATIN CAPITAL LETTER T WITH STROKE +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x017D # LATIN CAPITAL LETTER Z WITH CARON +0xAF 0x00AF # MACRON +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x0105 # LATIN SMALL LETTER A WITH OGONEK +0xB2 0x02DB # OGONEK +0xB3 0x0157 # LATIN SMALL LETTER R WITH CEDILLA +0xB4 0x00B4 # ACUTE ACCENT +0xB5 0x0129 # LATIN SMALL LETTER I WITH TILDE +0xB6 0x013C # LATIN SMALL LETTER L WITH CEDILLA +0xB7 0x02C7 # CARON +0xB8 0x00B8 # CEDILLA +0xB9 0x0161 # LATIN SMALL LETTER S WITH CARON +0xBA 0x0113 # LATIN SMALL LETTER E WITH MACRON +0xBB 0x0123 # LATIN SMALL LETTER G WITH CEDILLA +0xBC 0x0167 # LATIN SMALL LETTER T WITH STROKE +0xBD 0x014A # LATIN CAPITAL LETTER ENG +0xBE 0x017E # LATIN SMALL LETTER Z WITH CARON +0xBF 0x014B # LATIN SMALL LETTER ENG +0xC0 0x0100 # LATIN CAPITAL LETTER A WITH MACRON +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 # LATIN CAPITAL LETTER AE +0xC7 0x012E # LATIN CAPITAL LETTER I WITH OGONEK +0xC8 0x010C # LATIN CAPITAL LETTER C WITH CARON +0xC9 0x00C9 # LATIN 
CAPITAL LETTER E WITH ACUTE +0xCA 0x0118 # LATIN CAPITAL LETTER E WITH OGONEK +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x0116 # LATIN CAPITAL LETTER E WITH DOT ABOVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x012A # LATIN CAPITAL LETTER I WITH MACRON +0xD0 0x0110 # LATIN CAPITAL LETTER D WITH STROKE +0xD1 0x0145 # LATIN CAPITAL LETTER N WITH CEDILLA +0xD2 0x014C # LATIN CAPITAL LETTER O WITH MACRON +0xD3 0x0136 # LATIN CAPITAL LETTER K WITH CEDILLA +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x0172 # LATIN CAPITAL LETTER U WITH OGONEK +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x0168 # LATIN CAPITAL LETTER U WITH TILDE +0xDE 0x016A # LATIN CAPITAL LETTER U WITH MACRON +0xDF 0x00DF # LATIN SMALL LETTER SHARP S +0xE0 0x0101 # LATIN SMALL LETTER A WITH MACRON +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 # LATIN SMALL LETTER AE +0xE7 0x012F # LATIN SMALL LETTER I WITH OGONEK +0xE8 0x010D # LATIN SMALL LETTER C WITH CARON +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x0119 # LATIN SMALL LETTER E WITH OGONEK +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x0117 # LATIN SMALL LETTER E WITH DOT ABOVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x012B # LATIN SMALL LETTER I WITH MACRON +0xF0 0x0111 # LATIN SMALL LETTER D WITH STROKE +0xF1 0x0146 # LATIN SMALL LETTER N WITH CEDILLA 
+0xF2 0x014D # LATIN SMALL LETTER O WITH MACRON +0xF3 0x0137 # LATIN SMALL LETTER K WITH CEDILLA +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xF9 0x0173 # LATIN SMALL LETTER U WITH OGONEK +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x0169 # LATIN SMALL LETTER U WITH TILDE +0xFE 0x016B # LATIN SMALL LETTER U WITH MACRON +0xFF 0x02D9 # DOT ABOVE diff --git a/charsets/8859-5.txt b/charsets/8859-5.txt new file mode 100644 index 0000000..9783092 --- /dev/null +++ b/charsets/8859-5.txt @@ -0,0 +1,230 @@ +# +# Name: ISO 8859-5 (1988) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-5 (1988) characters map into Unicode. 
+# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-5 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-5 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL 
LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x0401 # CYRILLIC CAPITAL LETTER IO +0xA2 0x0402 # CYRILLIC CAPITAL LETTER DJE +0xA3 0x0403 # CYRILLIC CAPITAL LETTER GJE +0xA4 0x0404 # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0xA5 0x0405 # CYRILLIC CAPITAL LETTER DZE +0xA6 0x0406 # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0xA7 0x0407 # CYRILLIC CAPITAL LETTER YI +0xA8 0x0408 # CYRILLIC CAPITAL LETTER JE +0xA9 0x0409 # CYRILLIC CAPITAL LETTER LJE +0xAA 0x040A # CYRILLIC CAPITAL LETTER NJE +0xAB 0x040B # CYRILLIC CAPITAL LETTER TSHE +0xAC 0x040C # CYRILLIC CAPITAL LETTER KJE +0xAD 0x00AD # SOFT 
HYPHEN +0xAE 0x040E # CYRILLIC CAPITAL LETTER SHORT U +0xAF 0x040F # CYRILLIC CAPITAL LETTER DZHE +0xB0 0x0410 # CYRILLIC CAPITAL LETTER A +0xB1 0x0411 # CYRILLIC CAPITAL LETTER BE +0xB2 0x0412 # CYRILLIC CAPITAL LETTER VE +0xB3 0x0413 # CYRILLIC CAPITAL LETTER GHE +0xB4 0x0414 # CYRILLIC CAPITAL LETTER DE +0xB5 0x0415 # CYRILLIC CAPITAL LETTER IE +0xB6 0x0416 # CYRILLIC CAPITAL LETTER ZHE +0xB7 0x0417 # CYRILLIC CAPITAL LETTER ZE +0xB8 0x0418 # CYRILLIC CAPITAL LETTER I +0xB9 0x0419 # CYRILLIC CAPITAL LETTER SHORT I +0xBA 0x041A # CYRILLIC CAPITAL LETTER KA +0xBB 0x041B # CYRILLIC CAPITAL LETTER EL +0xBC 0x041C # CYRILLIC CAPITAL LETTER EM +0xBD 0x041D # CYRILLIC CAPITAL LETTER EN +0xBE 0x041E # CYRILLIC CAPITAL LETTER O +0xBF 0x041F # CYRILLIC CAPITAL LETTER PE +0xC0 0x0420 # CYRILLIC CAPITAL LETTER ER +0xC1 0x0421 # CYRILLIC CAPITAL LETTER ES +0xC2 0x0422 # CYRILLIC CAPITAL LETTER TE +0xC3 0x0423 # CYRILLIC CAPITAL LETTER U +0xC4 0x0424 # CYRILLIC CAPITAL LETTER EF +0xC5 0x0425 # CYRILLIC CAPITAL LETTER HA +0xC6 0x0426 # CYRILLIC CAPITAL LETTER TSE +0xC7 0x0427 # CYRILLIC CAPITAL LETTER CHE +0xC8 0x0428 # CYRILLIC CAPITAL LETTER SHA +0xC9 0x0429 # CYRILLIC CAPITAL LETTER SHCHA +0xCA 0x042A # CYRILLIC CAPITAL LETTER HARD SIGN +0xCB 0x042B # CYRILLIC CAPITAL LETTER YERU +0xCC 0x042C # CYRILLIC CAPITAL LETTER SOFT SIGN +0xCD 0x042D # CYRILLIC CAPITAL LETTER E +0xCE 0x042E # CYRILLIC CAPITAL LETTER YU +0xCF 0x042F # CYRILLIC CAPITAL LETTER YA +0xD0 0x0430 # CYRILLIC SMALL LETTER A +0xD1 0x0431 # CYRILLIC SMALL LETTER BE +0xD2 0x0432 # CYRILLIC SMALL LETTER VE +0xD3 0x0433 # CYRILLIC SMALL LETTER GHE +0xD4 0x0434 # CYRILLIC SMALL LETTER DE +0xD5 0x0435 # CYRILLIC SMALL LETTER IE +0xD6 0x0436 # CYRILLIC SMALL LETTER ZHE +0xD7 0x0437 # CYRILLIC SMALL LETTER ZE +0xD8 0x0438 # CYRILLIC SMALL LETTER I +0xD9 0x0439 # CYRILLIC SMALL LETTER SHORT I +0xDA 0x043A # CYRILLIC SMALL LETTER KA +0xDB 0x043B # CYRILLIC SMALL LETTER EL +0xDC 0x043C # CYRILLIC SMALL LETTER EM +0xDD 
0x043D # CYRILLIC SMALL LETTER EN +0xDE 0x043E # CYRILLIC SMALL LETTER O +0xDF 0x043F # CYRILLIC SMALL LETTER PE +0xE0 0x0440 # CYRILLIC SMALL LETTER ER +0xE1 0x0441 # CYRILLIC SMALL LETTER ES +0xE2 0x0442 # CYRILLIC SMALL LETTER TE +0xE3 0x0443 # CYRILLIC SMALL LETTER U +0xE4 0x0444 # CYRILLIC SMALL LETTER EF +0xE5 0x0445 # CYRILLIC SMALL LETTER HA +0xE6 0x0446 # CYRILLIC SMALL LETTER TSE +0xE7 0x0447 # CYRILLIC SMALL LETTER CHE +0xE8 0x0448 # CYRILLIC SMALL LETTER SHA +0xE9 0x0449 # CYRILLIC SMALL LETTER SHCHA +0xEA 0x044A # CYRILLIC SMALL LETTER HARD SIGN +0xEB 0x044B # CYRILLIC SMALL LETTER YERU +0xEC 0x044C # CYRILLIC SMALL LETTER SOFT SIGN +0xED 0x044D # CYRILLIC SMALL LETTER E +0xEE 0x044E # CYRILLIC SMALL LETTER YU +0xEF 0x044F # CYRILLIC SMALL LETTER YA +0xF0 0x2116 # NUMERO SIGN +0xF1 0x0451 # CYRILLIC SMALL LETTER IO +0xF2 0x0452 # CYRILLIC SMALL LETTER DJE +0xF3 0x0453 # CYRILLIC SMALL LETTER GJE +0xF4 0x0454 # CYRILLIC SMALL LETTER UKRAINIAN IE +0xF5 0x0455 # CYRILLIC SMALL LETTER DZE +0xF6 0x0456 # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +0xF7 0x0457 # CYRILLIC SMALL LETTER YI +0xF8 0x0458 # CYRILLIC SMALL LETTER JE +0xF9 0x0459 # CYRILLIC SMALL LETTER LJE +0xFA 0x045A # CYRILLIC SMALL LETTER NJE +0xFB 0x045B # CYRILLIC SMALL LETTER TSHE +0xFC 0x045C # CYRILLIC SMALL LETTER KJE +0xFD 0x00A7 # SECTION SIGN +0xFE 0x045E # CYRILLIC SMALL LETTER SHORT U +0xFF 0x045F # CYRILLIC SMALL LETTER DZHE diff --git a/charsets/8859-6.txt b/charsets/8859-6.txt new file mode 100644 index 0000000..f15efdd --- /dev/null +++ b/charsets/8859-6.txt @@ -0,0 +1,185 @@ +# +# Name: ISO 8859-6 (1987) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. 
No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-6 (1987) characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-6 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-6 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0660 # ARABIC-INDIC DIGIT ZERO +0x31 0x0661 # ARABIC-INDIC DIGIT ONE +0x32 0x0662 # ARABIC-INDIC DIGIT TWO +0x33 0x0663 # ARABIC-INDIC DIGIT THREE +0x34 0x0664 # ARABIC-INDIC DIGIT FOUR +0x35 0x0665 # ARABIC-INDIC DIGIT FIVE +0x36 0x0666 # ARABIC-INDIC DIGIT SIX +0x37 0x0667 # ARABIC-INDIC DIGIT SEVEN +0x38 0x0668 # ARABIC-INDIC DIGIT EIGHT +0x39 0x0669 # ARABIC-INDIC DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 
0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # 
LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA4 0x00A4 # CURRENCY SIGN +0xAC 0x060C # ARABIC COMMA +0xAD 0x00AD # SOFT HYPHEN +0xBB 0x061B # ARABIC SEMICOLON +0xBF 0x061F # ARABIC QUESTION MARK +0xC1 0x0621 # ARABIC LETTER HAMZA +0xC2 0x0622 # ARABIC LETTER ALEF WITH MADDA ABOVE +0xC3 0x0623 # ARABIC LETTER ALEF WITH HAMZA ABOVE +0xC4 0x0624 # ARABIC LETTER WAW WITH HAMZA ABOVE +0xC5 0x0625 # ARABIC LETTER ALEF WITH HAMZA BELOW +0xC6 0x0626 # ARABIC LETTER YEH WITH HAMZA ABOVE +0xC7 0x0627 # ARABIC LETTER ALEF +0xC8 0x0628 # ARABIC LETTER BEH +0xC9 0x0629 # ARABIC LETTER TEH MARBUTA +0xCA 0x062A # ARABIC LETTER TEH +0xCB 0x062B # ARABIC LETTER THEH +0xCC 0x062C # ARABIC LETTER JEEM +0xCD 0x062D # ARABIC LETTER HAH +0xCE 0x062E # ARABIC LETTER KHAH +0xCF 0x062F # ARABIC LETTER DAL +0xD0 0x0630 # ARABIC LETTER THAL +0xD1 0x0631 # ARABIC LETTER REH +0xD2 0x0632 # ARABIC LETTER ZAIN +0xD3 0x0633 # ARABIC LETTER SEEN +0xD4 0x0634 # ARABIC LETTER SHEEN +0xD5 0x0635 # ARABIC LETTER SAD +0xD6 0x0636 # ARABIC LETTER DAD +0xD7 0x0637 # ARABIC LETTER TAH +0xD8 0x0638 # ARABIC LETTER ZAH +0xD9 0x0639 # ARABIC LETTER AIN +0xDA 0x063A # ARABIC LETTER GHAIN +0xE0 0x0640 # ARABIC TATWEEL +0xE1 0x0641 # ARABIC LETTER FEH +0xE2 0x0642 # ARABIC LETTER QAF +0xE3 0x0643 # ARABIC LETTER KAF +0xE4 0x0644 # ARABIC LETTER LAM +0xE5 0x0645 # ARABIC LETTER MEEM +0xE6 0x0646 # ARABIC LETTER NOON +0xE7 0x0647 # ARABIC LETTER HEH +0xE8 0x0648 # ARABIC LETTER WAW +0xE9 0x0649 # ARABIC LETTER ALEF MAKSURA +0xEA 0x064A # ARABIC LETTER YEH +0xEB 0x064B # ARABIC FATHATAN +0xEC 0x064C # ARABIC DAMMATAN +0xED 0x064D # ARABIC KASRATAN +0xEE 0x064E # ARABIC FATHA +0xEF 0x064F # ARABIC DAMMA +0xF0 
0x0650 # ARABIC KASRA +0xF1 0x0651 # ARABIC SHADDA +0xF2 0x0652 # ARABIC SUKUN diff --git a/charsets/8859-7.txt b/charsets/8859-7.txt new file mode 100644 index 0000000..499cd31 --- /dev/null +++ b/charsets/8859-7.txt @@ -0,0 +1,224 @@ +# +# Name: ISO 8859-7 (1987) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-7 (1987) characters map into Unicode. 
+# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-7 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-7 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL 
LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x02BD # MODIFIER LETTER REVERSED COMMA +0xA2 0x02BC # MODIFIER LETTER APOSTROPHE +0xA3 0x00A3 # POUND SIGN +0xA6 0x00A6 # BROKEN BAR +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x00A9 # COPYRIGHT SIGN +0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC # NOT SIGN +0xAD 0x00AD # SOFT HYPHEN +0xAF 0x2015 # HORIZONTAL BAR +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # SUPERSCRIPT THREE +0xB4 0x0384 # GREEK TONOS +0xB5 0x0385 # GREEK DIALYTIKA TONOS +0xB6 0x0386 # 
GREEK CAPITAL LETTER ALPHA WITH TONOS +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x0388 # GREEK CAPITAL LETTER EPSILON WITH TONOS +0xB9 0x0389 # GREEK CAPITAL LETTER ETA WITH TONOS +0xBA 0x038A # GREEK CAPITAL LETTER IOTA WITH TONOS +0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x038C # GREEK CAPITAL LETTER OMICRON WITH TONOS +0xBD 0x00BD # VULGAR FRACTION ONE HALF +0xBE 0x038E # GREEK CAPITAL LETTER UPSILON WITH TONOS +0xBF 0x038F # GREEK CAPITAL LETTER OMEGA WITH TONOS +0xC0 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0xC1 0x0391 # GREEK CAPITAL LETTER ALPHA +0xC2 0x0392 # GREEK CAPITAL LETTER BETA +0xC3 0x0393 # GREEK CAPITAL LETTER GAMMA +0xC4 0x0394 # GREEK CAPITAL LETTER DELTA +0xC5 0x0395 # GREEK CAPITAL LETTER EPSILON +0xC6 0x0396 # GREEK CAPITAL LETTER ZETA +0xC7 0x0397 # GREEK CAPITAL LETTER ETA +0xC8 0x0398 # GREEK CAPITAL LETTER THETA +0xC9 0x0399 # GREEK CAPITAL LETTER IOTA +0xCA 0x039A # GREEK CAPITAL LETTER KAPPA +0xCB 0x039B # GREEK CAPITAL LETTER LAMDA +0xCC 0x039C # GREEK CAPITAL LETTER MU +0xCD 0x039D # GREEK CAPITAL LETTER NU +0xCE 0x039E # GREEK CAPITAL LETTER XI +0xCF 0x039F # GREEK CAPITAL LETTER OMICRON +0xD0 0x03A0 # GREEK CAPITAL LETTER PI +0xD1 0x03A1 # GREEK CAPITAL LETTER RHO +0xD3 0x03A3 # GREEK CAPITAL LETTER SIGMA +0xD4 0x03A4 # GREEK CAPITAL LETTER TAU +0xD5 0x03A5 # GREEK CAPITAL LETTER UPSILON +0xD6 0x03A6 # GREEK CAPITAL LETTER PHI +0xD7 0x03A7 # GREEK CAPITAL LETTER CHI +0xD8 0x03A8 # GREEK CAPITAL LETTER PSI +0xD9 0x03A9 # GREEK CAPITAL LETTER OMEGA +0xDA 0x03AA # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +0xDB 0x03AB # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +0xDC 0x03AC # GREEK SMALL LETTER ALPHA WITH TONOS +0xDD 0x03AD # GREEK SMALL LETTER EPSILON WITH TONOS +0xDE 0x03AE # GREEK SMALL LETTER ETA WITH TONOS +0xDF 0x03AF # GREEK SMALL LETTER IOTA WITH TONOS +0xE0 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +0xE1 0x03B1 # GREEK SMALL LETTER ALPHA +0xE2 0x03B2 # GREEK SMALL LETTER BETA 
+0xE3 0x03B3 # GREEK SMALL LETTER GAMMA +0xE4 0x03B4 # GREEK SMALL LETTER DELTA +0xE5 0x03B5 # GREEK SMALL LETTER EPSILON +0xE6 0x03B6 # GREEK SMALL LETTER ZETA +0xE7 0x03B7 # GREEK SMALL LETTER ETA +0xE8 0x03B8 # GREEK SMALL LETTER THETA +0xE9 0x03B9 # GREEK SMALL LETTER IOTA +0xEA 0x03BA # GREEK SMALL LETTER KAPPA +0xEB 0x03BB # GREEK SMALL LETTER LAMDA +0xEC 0x03BC # GREEK SMALL LETTER MU +0xED 0x03BD # GREEK SMALL LETTER NU +0xEE 0x03BE # GREEK SMALL LETTER XI +0xEF 0x03BF # GREEK SMALL LETTER OMICRON +0xF0 0x03C0 # GREEK SMALL LETTER PI +0xF1 0x03C1 # GREEK SMALL LETTER RHO +0xF2 0x03C2 # GREEK SMALL LETTER FINAL SIGMA +0xF3 0x03C3 # GREEK SMALL LETTER SIGMA +0xF4 0x03C4 # GREEK SMALL LETTER TAU +0xF5 0x03C5 # GREEK SMALL LETTER UPSILON +0xF6 0x03C6 # GREEK SMALL LETTER PHI +0xF7 0x03C7 # GREEK SMALL LETTER CHI +0xF8 0x03C8 # GREEK SMALL LETTER PSI +0xF9 0x03C9 # GREEK SMALL LETTER OMEGA +0xFA 0x03CA # GREEK SMALL LETTER IOTA WITH DIALYTIKA +0xFB 0x03CB # GREEK SMALL LETTER UPSILON WITH DIALYTIKA +0xFC 0x03CC # GREEK SMALL LETTER OMICRON WITH TONOS +0xFD 0x03CD # GREEK SMALL LETTER UPSILON WITH TONOS +0xFE 0x03CE # GREEK SMALL LETTER OMEGA WITH TONOS diff --git a/charsets/8859-8.txt b/charsets/8859-8.txt new file mode 100644 index 0000000..347f567 --- /dev/null +++ b/charsets/8859-8.txt @@ -0,0 +1,192 @@ +# +# Name: ISO 8859-8 (1988) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. 
If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-8 (1988) characters map into Unicode. +# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-8 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-8 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL 
LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 
0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A4 # CURRENCY SIGN +0xA5 0x00A5 # YEN SIGN +0xA6 0x00A6 # BROKEN BAR +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x00D7 # MULTIPLICATION SIGN +0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC # NOT SIGN +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x00AE # REGISTERED SIGN +0xAF 0x203E # OVERLINE +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # SUPERSCRIPT THREE +0xB4 0x00B4 # ACUTE ACCENT +0xB5 0x00B5 # MICRO SIGN +0xB6 0x00B6 # PILCROW SIGN +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x00B8 # CEDILLA +0xB9 0x00B9 # SUPERSCRIPT ONE +0xBA 0x00F7 # DIVISION SIGN +0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC # VULGAR FRACTION ONE QUARTER +0xBD 0x00BD # VULGAR FRACTION ONE HALF +0xBE 0x00BE # VULGAR FRACTION THREE QUARTERS +0xDF 0x2017 # DOUBLE LOW LINE +0xE0 0x05D0 # HEBREW LETTER ALEF +0xE1 0x05D1 # HEBREW LETTER BET +0xE2 0x05D2 # HEBREW LETTER GIMEL +0xE3 0x05D3 # HEBREW LETTER DALET +0xE4 0x05D4 # HEBREW LETTER HE +0xE5 0x05D5 # HEBREW LETTER VAV +0xE6 0x05D6 # HEBREW LETTER ZAYIN +0xE7 0x05D7 # HEBREW LETTER HET +0xE8 0x05D8 # HEBREW LETTER TET +0xE9 0x05D9 # HEBREW LETTER YOD +0xEA 0x05DA # HEBREW LETTER FINAL KAF +0xEB 0x05DB # HEBREW LETTER KAF +0xEC 0x05DC # HEBREW LETTER LAMED +0xED 0x05DD # HEBREW LETTER FINAL MEM +0xEE 0x05DE # HEBREW LETTER MEM +0xEF 0x05DF # HEBREW LETTER FINAL NUN +0xF0 0x05E0 # HEBREW LETTER NUN +0xF1 0x05E1 # HEBREW LETTER SAMEKH +0xF2 0x05E2 # HEBREW LETTER AYIN +0xF3 0x05E3 # HEBREW LETTER FINAL PE +0xF4 0x05E4 # HEBREW LETTER PE +0xF5 0x05E5 # HEBREW LETTER FINAL TSADI +0xF6 0x05E6 # HEBREW LETTER TSADI +0xF7 0x05E7 # HEBREW LETTER QOF +0xF8 0x05E8 # HEBREW LETTER RESH +0xF9 0x05E9 # HEBREW LETTER SHIN +0xFA 0x05EA # HEBREW LETTER TAV diff --git a/charsets/8859-9.txt 
b/charsets/8859-9.txt new file mode 100644 index 0000000..ef994d2 --- /dev/null +++ b/charsets/8859-9.txt @@ -0,0 +1,232 @@ +# +# Name: ISO 8859-9 (1989) to Unicode +# Unicode version: 1.1 +# Table version: 0.1 +# Table format: Format A +# Date: 16 January 1995 +# Authors: Tim Greenwood +# John H. Jenkins +# +# Copyright (c) 1991-1995 Unicode, Inc. All Rights reserved. +# +# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). +# No claims are made as to fitness for any particular purpose. No +# warranties of any kind are expressed or implied. The recipient +# agrees to determine applicability of information provided. If this +# file has been provided on magnetic media by Unicode, Inc., the sole +# remedy for any claim will be exchange of defective media within 90 +# days of receipt. +# +# Recipient is granted the right to make copies in any form for +# internal distribution and to freely use the information supplied +# in the creation of products supporting Unicode. Unicode, Inc. +# specifically excludes the right to re-distribute this file directly +# to third parties or other organizations whether for profit or not. +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO 8859-9 (1989) characters map into Unicode. 
+# +# Format: Three tab-separated columns +# Column #1 is the ISO 8859-9 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO 8859-9 order +# +# Any comments or problems, contact +# +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL 
LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x00A1 # INVERTED EXCLAMATION MARK +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A4 # CURRENCY SIGN +0xA5 0x00A5 # YEN SIGN +0xA6 0x00A6 # BROKEN BAR +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x00AA # FEMININE ORDINAL INDICATOR +0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC # NOT SIGN +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x00AE # REGISTERED SIGN +0xAF 0x00AF # MACRON +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # 
SUPERSCRIPT THREE +0xB4 0x00B4 # ACUTE ACCENT +0xB5 0x00B5 # MICRO SIGN +0xB6 0x00B6 # PILCROW SIGN +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x00B8 # CEDILLA +0xB9 0x00B9 # SUPERSCRIPT ONE +0xBA 0x00BA # MASCULINE ORDINAL INDICATOR +0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC # VULGAR FRACTION ONE QUARTER +0xBD 0x00BD # VULGAR FRACTION ONE HALF +0xBE 0x00BE # VULGAR FRACTION THREE QUARTERS +0xBF 0x00BF # INVERTED QUESTION MARK +0xC0 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 # LATIN CAPITAL LETTER AE +0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x011E # LATIN CAPITAL LETTER G WITH BREVE +0xD1 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 # LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x0130 # LATIN CAPITAL LETTER I WITH DOT ABOVE +0xDE 0x015E # LATIN CAPITAL LETTER S WITH 
CEDILLA +0xDF 0x00DF # LATIN SMALL LETTER SHARP S +0xE0 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 # LATIN SMALL LETTER AE +0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x011F # LATIN SMALL LETTER G WITH BREVE +0xF1 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x0131 # LATIN SMALL LETTER DOTLESS I +0xFE 0x015F # LATIN SMALL LETTER S WITH CEDILLA +0xFF 0x00FF # LATIN SMALL LETTER Y WITH DIAERESIS + + diff --git a/charsets/Makefile.in b/charsets/Makefile.in new file mode 100644 index 0000000..4c04869 --- /dev/null +++ b/charsets/Makefile.in @@ -0,0 +1,43 @@ +SHELL = /bin/sh +INSTALL = @INSTALL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +installroot = @installroot@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ 
+datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +REPL_EXT=@replsuffix@ +SPEC_EXT=@specsuffix@ +LIB_DIR=@datadir@/catdoc + +all: @charsetcheck@ +clean: +install: install-dirs install-spc install-charsets +install-spc: tex.spc ascii.spc tex.rpl ascii.rpl + $(INSTALL) -m 644 ascii.spc $(installroot)$(LIB_DIR)/ascii$(SPEC_EXT) + $(INSTALL) -m 644 tex.spc $(installroot)$(LIB_DIR)/tex$(SPEC_EXT) + $(INSTALL) -m 644 ascii.rpl $(installroot)$(LIB_DIR)/ascii$(REPL_EXT) + $(INSTALL) -m 644 tex.rpl $(installroot)$(LIB_DIR)/tex$(REPL_EXT) +install-charsets: + for i in *.txt; do\ + $(INSTALL) -m 0644 $$i $(installroot)$(LIB_DIR);\ + done +install-dirs: + ../mkinstalldirs $(installroot)$(LIB_DIR) +distclean: + rm Makefile diff --git a/charsets/ascii.rpl b/charsets/ascii.rpl new file mode 100644 index 0000000..3950431 --- /dev/null +++ b/charsets/ascii.rpl @@ -0,0 +1,162 @@ +00A0 " " +00A1 "!" +00A2 "c" +00A3 "F" +00A4 "r" +00A5 "Y" +00A6 "|" +00A7 " " +00A9 "(c)" +00AB '\"' +00AC '~' +00AD "" +00AE "(R)" +00B1 "+-" +00BB '\"' +00BC 1/4 +00BD 1/2 +00BE 3/4 +00BF ? 
+00C0 A +00C1 A +00C2 A +00C3 A +00C4 Ae +00C5 A +00C6 AE +00C7 C +00C8 E +00C9 E +00CA E +00CB E +00CC I +00CD I +00CE I +00CF I +00D1 N +00D2 O +00D3 O +00D4 O +00D5 O +00D6 Oe +00D7 * +00D8 O +00D9 U +00DA U +00DB U +00DC Ue +00DD Y +00DE TH +00DF ss +00E0 a +00E1 a +00E2 a +00E3 a +00E4 ae +00E5 a +00E6 ae +00E7 c +00E8 e +00E9 e +00EA e +00EB e +00EC i +00ED i +00EE i +00EF i +00F1 n +00F2 o +00F3 o +00F4 o +00F5 o +00F6 oe +00F7 / +00F8 o +00F9 u +00FA u +00FB u +00FC ue +00FD y +00FE th +00FF "y" +0410 A +0411 B +0412 W +0413 G +0414 D +0415 E +0416 ZH +0417 Z +0418 I +0419 Y +041a K +041b L +041c M +041d N +041e O +041f P +0420 R +0421 S +0422 T +0423 U +0424 F +0425 KH +0426 TZ +0427 CH +0428 SH +0429 SCH +042a "'" +042b Y +042c "'" +042d E +042e YU +042f YA +0430 a +0431 b +0432 w +0433 g +0434 d +0435 e +0436 zh +0437 z +0438 i +0439 y +043a k +043b l +043c m +043d n +043e o +043f p +0440 r +0441 s +0442 t +0443 u +0444 f +0445 kh +0446 tz +0447 ch +0448 sh +0449 sch +044a "'" +044b y +044c "'" +044d e +044e yu +044f ya +0401 YO +0451 yo +201A "'" +201E "''" +2030 "o/oo" +2039 "\"" +2018 "`" +2019 "'" +201C "``" +201D "''" +2022 "*" +2013 "-" +2014 " - " +2122 "tm" +203A "\"" +2116 "No" +2026 "..." 
diff --git a/charsets/ascii.spc b/charsets/ascii.spc new file mode 100644 index 0000000..66cfbe9 --- /dev/null +++ b/charsets/ascii.spc @@ -0,0 +1,3 @@ +001C "\t" +001E "\n" +00AD "" diff --git a/charsets/cp1250.txt b/charsets/cp1250.txt new file mode 100644 index 0000000..860ec3c --- /dev/null +++ b/charsets/cp1250.txt @@ -0,0 +1,274 @@ +# +# Name: cp1250 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1250 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1250 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 
0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A 
#LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 #UNDEFINED +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 #UNDEFINED +0x89 0x2030 #PER MILLE SIGN +0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x015A #LATIN CAPITAL LETTER S WITH ACUTE +0x8D 0x0164 #LATIN CAPITAL LETTER T WITH CARON +0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON +0x8F 0x0179 #LATIN CAPITAL LETTER Z WITH ACUTE +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 0x2122 #TRADE MARK SIGN +0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C 0x015B #LATIN SMALL LETTER S WITH ACUTE +0x9D 0x0165 #LATIN SMALL LETTER T WITH CARON +0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON +0x9F 0x017A #LATIN SMALL LETTER Z WITH ACUTE +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x02C7 #CARON +0xA2 0x02D8 #BREVE +0xA3 0x0141 #LATIN CAPITAL LETTER L WITH STROKE +0xA4 0x00A4 
#CURRENCY SIGN +0xA5 0x0104 #LATIN CAPITAL LETTER A WITH OGONEK +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x015E #LATIN CAPITAL LETTER S WITH CEDILLA +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x017B #LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x02DB #OGONEK +0xB3 0x0142 #LATIN SMALL LETTER L WITH STROKE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x00B8 #CEDILLA +0xB9 0x0105 #LATIN SMALL LETTER A WITH OGONEK +0xBA 0x015F #LATIN SMALL LETTER S WITH CEDILLA +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x013D #LATIN CAPITAL LETTER L WITH CARON +0xBD 0x02DD #DOUBLE ACUTE ACCENT +0xBE 0x013E #LATIN SMALL LETTER L WITH CARON +0xBF 0x017C #LATIN SMALL LETTER Z WITH DOT ABOVE +0xC0 0x0154 #LATIN CAPITAL LETTER R WITH ACUTE +0xC1 0x00C1 #LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x0102 #LATIN CAPITAL LETTER A WITH BREVE +0xC4 0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x0139 #LATIN CAPITAL LETTER L WITH ACUTE +0xC6 0x0106 #LATIN CAPITAL LETTER C WITH ACUTE +0xC7 0x00C7 #LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x010C #LATIN CAPITAL LETTER C WITH CARON +0xC9 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x0118 #LATIN CAPITAL LETTER E WITH OGONEK +0xCB 0x00CB #LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x011A #LATIN CAPITAL LETTER E WITH CARON +0xCD 0x00CD #LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x010E #LATIN CAPITAL LETTER D WITH CARON +0xD0 0x0110 #LATIN CAPITAL LETTER D WITH STROKE +0xD1 0x0143 #LATIN CAPITAL LETTER N WITH ACUTE +0xD2 0x0147 #LATIN CAPITAL LETTER N WITH CARON +0xD3 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 #LATIN CAPITAL 
LETTER O WITH CIRCUMFLEX +0xD5 0x0150 #LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0xD6 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 #MULTIPLICATION SIGN +0xD8 0x0158 #LATIN CAPITAL LETTER R WITH CARON +0xD9 0x016E #LATIN CAPITAL LETTER U WITH RING ABOVE +0xDA 0x00DA #LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x0170 #LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0xDC 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD #LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x0162 #LATIN CAPITAL LETTER T WITH CEDILLA +0xDF 0x00DF #LATIN SMALL LETTER SHARP S +0xE0 0x0155 #LATIN SMALL LETTER R WITH ACUTE +0xE1 0x00E1 #LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x0103 #LATIN SMALL LETTER A WITH BREVE +0xE4 0x00E4 #LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x013A #LATIN SMALL LETTER L WITH ACUTE +0xE6 0x0107 #LATIN SMALL LETTER C WITH ACUTE +0xE7 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x010D #LATIN SMALL LETTER C WITH CARON +0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE +0xEA 0x0119 #LATIN SMALL LETTER E WITH OGONEK +0xEB 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x011B #LATIN SMALL LETTER E WITH CARON +0xED 0x00ED #LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x010F #LATIN SMALL LETTER D WITH CARON +0xF0 0x0111 #LATIN SMALL LETTER D WITH STROKE +0xF1 0x0144 #LATIN SMALL LETTER N WITH ACUTE +0xF2 0x0148 #LATIN SMALL LETTER N WITH CARON +0xF3 0x00F3 #LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x0151 #LATIN SMALL LETTER O WITH DOUBLE ACUTE +0xF6 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 #DIVISION SIGN +0xF8 0x0159 #LATIN SMALL LETTER R WITH CARON +0xF9 0x016F #LATIN SMALL LETTER U WITH RING ABOVE +0xFA 0x00FA #LATIN SMALL LETTER U WITH ACUTE +0xFB 0x0171 #LATIN SMALL LETTER U WITH DOUBLE ACUTE +0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD #LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x0163 #LATIN 
SMALL LETTER T WITH CEDILLA +0xFF 0x02D9 #DOT ABOVE diff --git a/charsets/cp1251.txt b/charsets/cp1251.txt new file mode 100644 index 0000000..5189b95 --- /dev/null +++ b/charsets/cp1251.txt @@ -0,0 +1,274 @@ +# +# Name: cp1251_WinCyrillic to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1251_WinCyrillic code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1251_WinCyrillic order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 
0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 
0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x0402 #CYRILLIC CAPITAL LETTER DJE +0x81 0x0403 #CYRILLIC CAPITAL LETTER GJE +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0453 #CYRILLIC SMALL LETTER GJE +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 #UNDEFINED +0x89 0x2030 #PER MILLE SIGN +0x8A 0x0409 #CYRILLIC CAPITAL LETTER LJE +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x040A #CYRILLIC CAPITAL LETTER NJE +0x8D 0x040C #CYRILLIC CAPITAL LETTER KJE +0x8E 0x040B #CYRILLIC CAPITAL LETTER TSHE +0x8F 0x040F #CYRILLIC CAPITAL LETTER DZHE +0x90 0x0452 #CYRILLIC SMALL LETTER DJE +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 0x2122 #TRADE MARK SIGN +0x9A 0x0459 #CYRILLIC SMALL LETTER LJE +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C 0x045A #CYRILLIC SMALL LETTER NJE +0x9D 0x045C #CYRILLIC SMALL LETTER KJE +0x9E 0x045B #CYRILLIC SMALL LETTER TSHE +0x9F 0x045F #CYRILLIC SMALL LETTER DZHE +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x040E #CYRILLIC CAPITAL LETTER SHORT U +0xA2 0x045E #CYRILLIC SMALL LETTER SHORT U +0xA3 0x0408 #CYRILLIC CAPITAL LETTER JE +0xA4 0x00A4 #CURRENCY SIGN +0xA5 
0x0490 #CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x0401 #CYRILLIC CAPITAL LETTER IO +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x0404 #CYRILLIC CAPITAL LETTER UKRAINIAN IE +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x0407 #CYRILLIC CAPITAL LETTER YI +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x0406 #CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0xB3 0x0456 #CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +0xB4 0x0491 #CYRILLIC SMALL LETTER GHE WITH UPTURN +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x0451 #CYRILLIC SMALL LETTER IO +0xB9 0x2116 #NUMERO SIGN +0xBA 0x0454 #CYRILLIC SMALL LETTER UKRAINIAN IE +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x0458 #CYRILLIC SMALL LETTER JE +0xBD 0x0405 #CYRILLIC CAPITAL LETTER DZE +0xBE 0x0455 #CYRILLIC SMALL LETTER DZE +0xBF 0x0457 #CYRILLIC SMALL LETTER YI +0xC0 0x0410 #CYRILLIC CAPITAL LETTER A +0xC1 0x0411 #CYRILLIC CAPITAL LETTER BE +0xC2 0x0412 #CYRILLIC CAPITAL LETTER VE +0xC3 0x0413 #CYRILLIC CAPITAL LETTER GHE +0xC4 0x0414 #CYRILLIC CAPITAL LETTER DE +0xC5 0x0415 #CYRILLIC CAPITAL LETTER IE +0xC6 0x0416 #CYRILLIC CAPITAL LETTER ZHE +0xC7 0x0417 #CYRILLIC CAPITAL LETTER ZE +0xC8 0x0418 #CYRILLIC CAPITAL LETTER I +0xC9 0x0419 #CYRILLIC CAPITAL LETTER SHORT I +0xCA 0x041A #CYRILLIC CAPITAL LETTER KA +0xCB 0x041B #CYRILLIC CAPITAL LETTER EL +0xCC 0x041C #CYRILLIC CAPITAL LETTER EM +0xCD 0x041D #CYRILLIC CAPITAL LETTER EN +0xCE 0x041E #CYRILLIC CAPITAL LETTER O +0xCF 0x041F #CYRILLIC CAPITAL LETTER PE +0xD0 0x0420 #CYRILLIC CAPITAL LETTER ER +0xD1 0x0421 #CYRILLIC CAPITAL LETTER ES +0xD2 0x0422 #CYRILLIC CAPITAL LETTER TE +0xD3 0x0423 #CYRILLIC CAPITAL LETTER U +0xD4 0x0424 #CYRILLIC CAPITAL LETTER EF +0xD5 0x0425 #CYRILLIC CAPITAL LETTER HA +0xD6 0x0426 #CYRILLIC CAPITAL LETTER TSE +0xD7 0x0427 
#CYRILLIC CAPITAL LETTER CHE +0xD8 0x0428 #CYRILLIC CAPITAL LETTER SHA +0xD9 0x0429 #CYRILLIC CAPITAL LETTER SHCHA +0xDA 0x042A #CYRILLIC CAPITAL LETTER HARD SIGN +0xDB 0x042B #CYRILLIC CAPITAL LETTER YERU +0xDC 0x042C #CYRILLIC CAPITAL LETTER SOFT SIGN +0xDD 0x042D #CYRILLIC CAPITAL LETTER E +0xDE 0x042E #CYRILLIC CAPITAL LETTER YU +0xDF 0x042F #CYRILLIC CAPITAL LETTER YA +0xE0 0x0430 #CYRILLIC SMALL LETTER A +0xE1 0x0431 #CYRILLIC SMALL LETTER BE +0xE2 0x0432 #CYRILLIC SMALL LETTER VE +0xE3 0x0433 #CYRILLIC SMALL LETTER GHE +0xE4 0x0434 #CYRILLIC SMALL LETTER DE +0xE5 0x0435 #CYRILLIC SMALL LETTER IE +0xE6 0x0436 #CYRILLIC SMALL LETTER ZHE +0xE7 0x0437 #CYRILLIC SMALL LETTER ZE +0xE8 0x0438 #CYRILLIC SMALL LETTER I +0xE9 0x0439 #CYRILLIC SMALL LETTER SHORT I +0xEA 0x043A #CYRILLIC SMALL LETTER KA +0xEB 0x043B #CYRILLIC SMALL LETTER EL +0xEC 0x043C #CYRILLIC SMALL LETTER EM +0xED 0x043D #CYRILLIC SMALL LETTER EN +0xEE 0x043E #CYRILLIC SMALL LETTER O +0xEF 0x043F #CYRILLIC SMALL LETTER PE +0xF0 0x0440 #CYRILLIC SMALL LETTER ER +0xF1 0x0441 #CYRILLIC SMALL LETTER ES +0xF2 0x0442 #CYRILLIC SMALL LETTER TE +0xF3 0x0443 #CYRILLIC SMALL LETTER U +0xF4 0x0444 #CYRILLIC SMALL LETTER EF +0xF5 0x0445 #CYRILLIC SMALL LETTER HA +0xF6 0x0446 #CYRILLIC SMALL LETTER TSE +0xF7 0x0447 #CYRILLIC SMALL LETTER CHE +0xF8 0x0448 #CYRILLIC SMALL LETTER SHA +0xF9 0x0449 #CYRILLIC SMALL LETTER SHCHA +0xFA 0x044A #CYRILLIC SMALL LETTER HARD SIGN +0xFB 0x044B #CYRILLIC SMALL LETTER YERU +0xFC 0x044C #CYRILLIC SMALL LETTER SOFT SIGN +0xFD 0x044D #CYRILLIC SMALL LETTER E +0xFE 0x044E #CYRILLIC SMALL LETTER YU +0xFF 0x044F #CYRILLIC SMALL LETTER YA + diff --git a/charsets/cp1252.txt b/charsets/cp1252.txt new file mode 100644 index 0000000..066f3c5 --- /dev/null +++ b/charsets/cp1252.txt @@ -0,0 +1,274 @@ +# +# Name: cp1252 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# 
+# Format: Three tab-separated columns +# Column #1 is the cp1252 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1252 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 
0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN 
SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT +0x89 0x2030 #PER MILLE SIGN +0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x0152 #LATIN CAPITAL LIGATURE OE +0x8D #UNDEFINED +0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON +0x8F #UNDEFINED +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 0x02DC #SMALL TILDE +0x99 0x2122 #TRADE MARK SIGN +0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C 0x0153 #LATIN SMALL LIGATURE OE +0x9D #UNDEFINED +0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON +0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x00A1 #INVERTED EXCLAMATION MARK +0xA2 0x00A2 #CENT SIGN +0xA3 0x00A3 #POUND SIGN +0xA4 0x00A4 #CURRENCY SIGN +0xA5 0x00A5 #YEN SIGN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x00AA #FEMININE ORDINAL INDICATOR +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x00AF #MACRON +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x00B2 #SUPERSCRIPT TWO +0xB3 0x00B3 #SUPERSCRIPT THREE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT 
+0xB8 0x00B8 #CEDILLA +0xB9 0x00B9 #SUPERSCRIPT ONE +0xBA 0x00BA #MASCULINE ORDINAL INDICATOR +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC #VULGAR FRACTION ONE QUARTER +0xBD 0x00BD #VULGAR FRACTION ONE HALF +0xBE 0x00BE #VULGAR FRACTION THREE QUARTERS +0xBF 0x00BF #INVERTED QUESTION MARK +0xC0 0x00C0 #LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 #LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 #LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 #LATIN CAPITAL LETTER AE +0xC7 0x00C7 #LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 #LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA #LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB #LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC #LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD #LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF #LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x00D0 #LATIN CAPITAL LETTER ETH +0xD1 0x00D1 #LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 #LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 #LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 #MULTIPLICATION SIGN +0xD8 0x00D8 #LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 #LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA #LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB #LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD #LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x00DE #LATIN CAPITAL LETTER THORN +0xDF 0x00DF #LATIN SMALL LETTER SHARP S +0xE0 0x00E0 #LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 #LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX 
+0xE3 0x00E3 #LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 #LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 #LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 #LATIN SMALL LETTER AE +0xE7 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 #LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA #LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC #LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED #LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF #LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x00F0 #LATIN SMALL LETTER ETH +0xF1 0x00F1 #LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 #LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 #LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 #LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 #DIVISION SIGN +0xF8 0x00F8 #LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 #LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA #LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB #LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD #LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x00FE #LATIN SMALL LETTER THORN +0xFF 0x00FF #LATIN SMALL LETTER Y WITH DIAERESIS diff --git a/charsets/cp1253.txt b/charsets/cp1253.txt new file mode 100644 index 0000000..4d05935 --- /dev/null +++ b/charsets/cp1253.txt @@ -0,0 +1,274 @@ +# +# Name: cp1253 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1253 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1253 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF 
TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 
#LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0192 #LATIN SMALL LETTER F 
WITH HOOK +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 #UNDEFINED +0x89 0x2030 #PER MILLE SIGN +0x8A #UNDEFINED +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C #UNDEFINED +0x8D #UNDEFINED +0x8E #UNDEFINED +0x8F #UNDEFINED +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 0x2122 #TRADE MARK SIGN +0x9A #UNDEFINED +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C #UNDEFINED +0x9D #UNDEFINED +0x9E #UNDEFINED +0x9F #UNDEFINED +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x0385 #GREEK DIALYTIKA TONOS +0xA2 0x0386 #GREEK CAPITAL LETTER ALPHA WITH TONOS +0xA3 0x00A3 #POUND SIGN +0xA4 0x00A4 #CURRENCY SIGN +0xA5 0x00A5 #YEN SIGN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA #UNDEFINED +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x2015 #HORIZONTAL BAR +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x00B2 #SUPERSCRIPT TWO +0xB3 0x00B3 #SUPERSCRIPT THREE +0xB4 0x0384 #GREEK TONOS +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x0388 #GREEK CAPITAL LETTER EPSILON WITH TONOS +0xB9 0x0389 #GREEK CAPITAL LETTER ETA WITH TONOS +0xBA 0x038A #GREEK CAPITAL LETTER IOTA WITH TONOS +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x038C #GREEK CAPITAL LETTER OMICRON WITH TONOS +0xBD 0x00BD #VULGAR FRACTION ONE HALF +0xBE 0x038E #GREEK CAPITAL LETTER UPSILON WITH TONOS +0xBF 0x038F #GREEK CAPITAL LETTER OMEGA WITH TONOS +0xC0 0x0390 #GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0xC1 0x0391 #GREEK CAPITAL LETTER ALPHA +0xC2 0x0392 #GREEK 
CAPITAL LETTER BETA +0xC3 0x0393 #GREEK CAPITAL LETTER GAMMA +0xC4 0x0394 #GREEK CAPITAL LETTER DELTA +0xC5 0x0395 #GREEK CAPITAL LETTER EPSILON +0xC6 0x0396 #GREEK CAPITAL LETTER ZETA +0xC7 0x0397 #GREEK CAPITAL LETTER ETA +0xC8 0x0398 #GREEK CAPITAL LETTER THETA +0xC9 0x0399 #GREEK CAPITAL LETTER IOTA +0xCA 0x039A #GREEK CAPITAL LETTER KAPPA +0xCB 0x039B #GREEK CAPITAL LETTER LAMDA +0xCC 0x039C #GREEK CAPITAL LETTER MU +0xCD 0x039D #GREEK CAPITAL LETTER NU +0xCE 0x039E #GREEK CAPITAL LETTER XI +0xCF 0x039F #GREEK CAPITAL LETTER OMICRON +0xD0 0x03A0 #GREEK CAPITAL LETTER PI +0xD1 0x03A1 #GREEK CAPITAL LETTER RHO +0xD2 #UNDEFINED +0xD3 0x03A3 #GREEK CAPITAL LETTER SIGMA +0xD4 0x03A4 #GREEK CAPITAL LETTER TAU +0xD5 0x03A5 #GREEK CAPITAL LETTER UPSILON +0xD6 0x03A6 #GREEK CAPITAL LETTER PHI +0xD7 0x03A7 #GREEK CAPITAL LETTER CHI +0xD8 0x03A8 #GREEK CAPITAL LETTER PSI +0xD9 0x03A9 #GREEK CAPITAL LETTER OMEGA +0xDA 0x03AA #GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +0xDB 0x03AB #GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +0xDC 0x03AC #GREEK SMALL LETTER ALPHA WITH TONOS +0xDD 0x03AD #GREEK SMALL LETTER EPSILON WITH TONOS +0xDE 0x03AE #GREEK SMALL LETTER ETA WITH TONOS +0xDF 0x03AF #GREEK SMALL LETTER IOTA WITH TONOS +0xE0 0x03B0 #GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +0xE1 0x03B1 #GREEK SMALL LETTER ALPHA +0xE2 0x03B2 #GREEK SMALL LETTER BETA +0xE3 0x03B3 #GREEK SMALL LETTER GAMMA +0xE4 0x03B4 #GREEK SMALL LETTER DELTA +0xE5 0x03B5 #GREEK SMALL LETTER EPSILON +0xE6 0x03B6 #GREEK SMALL LETTER ZETA +0xE7 0x03B7 #GREEK SMALL LETTER ETA +0xE8 0x03B8 #GREEK SMALL LETTER THETA +0xE9 0x03B9 #GREEK SMALL LETTER IOTA +0xEA 0x03BA #GREEK SMALL LETTER KAPPA +0xEB 0x03BB #GREEK SMALL LETTER LAMDA +0xEC 0x03BC #GREEK SMALL LETTER MU +0xED 0x03BD #GREEK SMALL LETTER NU +0xEE 0x03BE #GREEK SMALL LETTER XI +0xEF 0x03BF #GREEK SMALL LETTER OMICRON +0xF0 0x03C0 #GREEK SMALL LETTER PI +0xF1 0x03C1 #GREEK SMALL LETTER RHO +0xF2 0x03C2 #GREEK SMALL LETTER FINAL SIGMA +0xF3 
0x03C3 #GREEK SMALL LETTER SIGMA +0xF4 0x03C4 #GREEK SMALL LETTER TAU +0xF5 0x03C5 #GREEK SMALL LETTER UPSILON +0xF6 0x03C6 #GREEK SMALL LETTER PHI +0xF7 0x03C7 #GREEK SMALL LETTER CHI +0xF8 0x03C8 #GREEK SMALL LETTER PSI +0xF9 0x03C9 #GREEK SMALL LETTER OMEGA +0xFA 0x03CA #GREEK SMALL LETTER IOTA WITH DIALYTIKA +0xFB 0x03CB #GREEK SMALL LETTER UPSILON WITH DIALYTIKA +0xFC 0x03CC #GREEK SMALL LETTER OMICRON WITH TONOS +0xFD 0x03CD #GREEK SMALL LETTER UPSILON WITH TONOS +0xFE 0x03CE #GREEK SMALL LETTER OMEGA WITH TONOS +0xFF #UNDEFINED diff --git a/charsets/cp1254.txt b/charsets/cp1254.txt new file mode 100644 index 0000000..b20bfdf --- /dev/null +++ b/charsets/cp1254.txt @@ -0,0 +1,274 @@ +# +# Name: cp1254 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1254 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1254 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD 
SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW 
LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT +0x89 0x2030 #PER MILLE SIGN +0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x0152 #LATIN CAPITAL LIGATURE OE +0x8D #UNDEFINED +0x8E #UNDEFINED +0x8F #UNDEFINED +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 0x02DC #SMALL TILDE +0x99 0x2122 #TRADE MARK SIGN +0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE 
QUOTATION MARK +0x9C 0x0153 #LATIN SMALL LIGATURE OE +0x9D #UNDEFINED +0x9E #UNDEFINED +0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x00A1 #INVERTED EXCLAMATION MARK +0xA2 0x00A2 #CENT SIGN +0xA3 0x00A3 #POUND SIGN +0xA4 0x00A4 #CURRENCY SIGN +0xA5 0x00A5 #YEN SIGN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x00AA #FEMININE ORDINAL INDICATOR +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x00AF #MACRON +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x00B2 #SUPERSCRIPT TWO +0xB3 0x00B3 #SUPERSCRIPT THREE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x00B8 #CEDILLA +0xB9 0x00B9 #SUPERSCRIPT ONE +0xBA 0x00BA #MASCULINE ORDINAL INDICATOR +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC #VULGAR FRACTION ONE QUARTER +0xBD 0x00BD #VULGAR FRACTION ONE HALF +0xBE 0x00BE #VULGAR FRACTION THREE QUARTERS +0xBF 0x00BF #INVERTED QUESTION MARK +0xC0 0x00C0 #LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 #LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 #LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 #LATIN CAPITAL LETTER AE +0xC7 0x00C7 #LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 #LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA #LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB #LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC #LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD #LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF #LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x011E #LATIN CAPITAL LETTER G WITH BREVE +0xD1 0x00D1 
#LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 #LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 #LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 #MULTIPLICATION SIGN +0xD8 0x00D8 #LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 #LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA #LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB #LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x0130 #LATIN CAPITAL LETTER I WITH DOT ABOVE +0xDE 0x015E #LATIN CAPITAL LETTER S WITH CEDILLA +0xDF 0x00DF #LATIN SMALL LETTER SHARP S +0xE0 0x00E0 #LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 #LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x00E3 #LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 #LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 #LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 #LATIN SMALL LETTER AE +0xE7 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 #LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA #LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC #LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED #LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF #LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x011F #LATIN SMALL LETTER G WITH BREVE +0xF1 0x00F1 #LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 #LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 #LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 #LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 #DIVISION SIGN +0xF8 0x00F8 #LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 #LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA #LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB #LATIN SMALL 
LETTER U WITH CIRCUMFLEX +0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x0131 #LATIN SMALL LETTER DOTLESS I +0xFE 0x015F #LATIN SMALL LETTER S WITH CEDILLA +0xFF 0x00FF #LATIN SMALL LETTER Y WITH DIAERESIS diff --git a/charsets/cp1255.txt b/charsets/cp1255.txt new file mode 100644 index 0000000..9feb4e7 --- /dev/null +++ b/charsets/cp1255.txt @@ -0,0 +1,274 @@ +# +# Name: cp1255 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1255 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1255 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A 
#ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 
#LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT +0x89 0x2030 #PER MILLE SIGN +0x8A #UNDEFINED +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C #UNDEFINED +0x8D #UNDEFINED +0x8E #UNDEFINED +0x8F #UNDEFINED +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 0x02DC #SMALL TILDE +0x99 0x2122 #TRADE MARK SIGN +0x9A #UNDEFINED +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C #UNDEFINED +0x9D #UNDEFINED +0x9E #UNDEFINED +0x9F #UNDEFINED +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x00A1 #INVERTED EXCLAMATION MARK +0xA2 0x00A2 #CENT SIGN +0xA3 0x00A3 #POUND SIGN +0xA4 0x20AA #NEW SHEQEL SIGN +0xA5 0x00A5 #YEN SIGN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x00D7 #MULTIPLICATION SIGN +0xAB 0x00AB 
#LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x00AF #MACRON +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x00B2 #SUPERSCRIPT TWO +0xB3 0x00B3 #SUPERSCRIPT THREE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x00B8 #CEDILLA +0xB9 0x00B9 #SUPERSCRIPT ONE +0xBA 0x00F7 #DIVISION SIGN +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC #VULGAR FRACTION ONE QUARTER +0xBD 0x00BD #VULGAR FRACTION ONE HALF +0xBE 0x00BE #VULGAR FRACTION THREE QUARTERS +0xBF 0x00BF #INVERTED QUESTION MARK +0xC0 0x05B0 #HEBREW POINT SHEVA +0xC1 0x05B1 #HEBREW POINT HATAF SEGOL +0xC2 0x05B2 #HEBREW POINT HATAF PATAH +0xC3 0x05B3 #HEBREW POINT HATAF QAMATS +0xC4 0x05B4 #HEBREW POINT HIRIQ +0xC5 0x05B5 #HEBREW POINT TSERE +0xC6 0x05B6 #HEBREW POINT SEGOL +0xC7 0x05B7 #HEBREW POINT PATAH +0xC8 0x05B8 #HEBREW POINT QAMATS +0xC9 0x05B9 #HEBREW POINT HOLAM +0xCA #UNDEFINED +0xCB 0x05BB #HEBREW POINT QUBUTS +0xCC 0x05BC #HEBREW POINT DAGESH OR MAPIQ +0xCD 0x05BD #HEBREW POINT METEG +0xCE 0x05BE #HEBREW PUNCTUATION MAQAF +0xCF 0x05BF #HEBREW POINT RAFE +0xD0 0x05C0 #HEBREW PUNCTUATION PASEQ +0xD1 0x05C1 #HEBREW POINT SHIN DOT +0xD2 0x05C2 #HEBREW POINT SIN DOT +0xD3 0x05C3 #HEBREW PUNCTUATION SOF PASUQ +0xD4 0x05F0 #HEBREW LIGATURE YIDDISH DOUBLE VAV +0xD5 0x05F1 #HEBREW LIGATURE YIDDISH VAV YOD +0xD6 0x05F2 #HEBREW LIGATURE YIDDISH DOUBLE YOD +0xD7 0x05F3 #HEBREW PUNCTUATION GERESH +0xD8 0x05F4 #HEBREW PUNCTUATION GERSHAYIM +0xD9 #UNDEFINED +0xDA #UNDEFINED +0xDB #UNDEFINED +0xDC #UNDEFINED +0xDD #UNDEFINED +0xDE #UNDEFINED +0xDF #UNDEFINED +0xE0 0x05D0 #HEBREW LETTER ALEF +0xE1 0x05D1 #HEBREW LETTER BET +0xE2 0x05D2 #HEBREW LETTER GIMEL +0xE3 0x05D3 #HEBREW LETTER DALET +0xE4 0x05D4 #HEBREW LETTER HE +0xE5 0x05D5 #HEBREW LETTER VAV +0xE6 0x05D6 #HEBREW LETTER ZAYIN +0xE7 0x05D7 #HEBREW LETTER HET +0xE8 0x05D8 #HEBREW 
LETTER TET +0xE9 0x05D9 #HEBREW LETTER YOD +0xEA 0x05DA #HEBREW LETTER FINAL KAF +0xEB 0x05DB #HEBREW LETTER KAF +0xEC 0x05DC #HEBREW LETTER LAMED +0xED 0x05DD #HEBREW LETTER FINAL MEM +0xEE 0x05DE #HEBREW LETTER MEM +0xEF 0x05DF #HEBREW LETTER FINAL NUN +0xF0 0x05E0 #HEBREW LETTER NUN +0xF1 0x05E1 #HEBREW LETTER SAMEKH +0xF2 0x05E2 #HEBREW LETTER AYIN +0xF3 0x05E3 #HEBREW LETTER FINAL PE +0xF4 0x05E4 #HEBREW LETTER PE +0xF5 0x05E5 #HEBREW LETTER FINAL TSADI +0xF6 0x05E6 #HEBREW LETTER TSADI +0xF7 0x05E7 #HEBREW LETTER QOF +0xF8 0x05E8 #HEBREW LETTER RESH +0xF9 0x05E9 #HEBREW LETTER SHIN +0xFA 0x05EA #HEBREW LETTER TAV +0xFB #UNDEFINED +0xFC #UNDEFINED +0xFD 0x200E #LEFT-TO-RIGHT MARK +0xFE 0x200F #RIGHT-TO-LEFT MARK +0xFF #UNDEFINED diff --git a/charsets/cp1256.txt b/charsets/cp1256.txt new file mode 100644 index 0000000..6301e4c --- /dev/null +++ b/charsets/cp1256.txt @@ -0,0 +1,274 @@ +# +# Name: cp1256 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1256 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1256 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END 
OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL 
LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 0x067E #ARABIC LETTER PEH +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT +0x89 0x2030 #PER MILLE SIGN +0x8A #UNDEFINED +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x0152 #LATIN CAPITAL LIGATURE OE +0x8D 0x0686 #ARABIC LETTER TCHEH +0x8E 0x0698 #ARABIC LETTER JEH +0x8F #UNDEFINED +0x90 0x06AF #ARABIC LETTER GAF +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE 
QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 0x2122 #TRADE MARK SIGN +0x9A #UNDEFINED +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C 0x0153 #LATIN SMALL LIGATURE OE +0x9D 0x200C #ZERO WIDTH NON-JOINER +0x9E 0x200D #ZERO WIDTH JOINER +0x9F #UNDEFINED +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x060C #ARABIC COMMA +0xA2 0x00A2 #CENT SIGN +0xA3 0x00A3 #POUND SIGN +0xA4 0x00A4 #CURRENCY SIGN +0xA5 0x00A5 #YEN SIGN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA #UNDEFINED +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x00AF #MACRON +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x00B2 #SUPERSCRIPT TWO +0xB3 0x00B3 #SUPERSCRIPT THREE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x00B8 #CEDILLA +0xB9 0x00B9 #SUPERSCRIPT ONE +0xBA 0x061B #ARABIC SEMICOLON +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC #VULGAR FRACTION ONE QUARTER +0xBD 0x00BD #VULGAR FRACTION ONE HALF +0xBE 0x00BE #VULGAR FRACTION THREE QUARTERS +0xBF 0x061F #ARABIC QUESTION MARK +0xC0 #UNDEFINED +0xC1 0x0621 #ARABIC LETTER HAMZA +0xC2 0x0622 #ARABIC LETTER ALEF WITH MADDA ABOVE +0xC3 0x0623 #ARABIC LETTER ALEF WITH HAMZA ABOVE +0xC4 0x0624 #ARABIC LETTER WAW WITH HAMZA ABOVE +0xC5 0x0625 #ARABIC LETTER ALEF WITH HAMZA BELOW +0xC6 0x0626 #ARABIC LETTER YEH WITH HAMZA ABOVE +0xC7 0x0627 #ARABIC LETTER ALEF +0xC8 0x0628 #ARABIC LETTER BEH +0xC9 0x0629 #ARABIC LETTER TEH MARBUTA +0xCA 0x062A #ARABIC LETTER TEH +0xCB 0x062B #ARABIC LETTER THEH +0xCC 0x062C #ARABIC LETTER JEEM +0xCD 0x062D #ARABIC LETTER HAH +0xCE 0x062E #ARABIC LETTER KHAH +0xCF 0x062F #ARABIC LETTER DAL +0xD0 0x0630 #ARABIC LETTER THAL +0xD1 0x0631 #ARABIC LETTER REH +0xD2 0x0632 #ARABIC LETTER ZAIN +0xD3 0x0633 #ARABIC 
LETTER SEEN +0xD4 0x0634 #ARABIC LETTER SHEEN +0xD5 0x0635 #ARABIC LETTER SAD +0xD6 0x0636 #ARABIC LETTER DAD +0xD7 0x00D7 #MULTIPLICATION SIGN +0xD8 0x0637 #ARABIC LETTER TAH +0xD9 0x0638 #ARABIC LETTER ZAH +0xDA 0x0639 #ARABIC LETTER AIN +0xDB 0x063A #ARABIC LETTER GHAIN +0xDC 0x0640 #ARABIC TATWEEL +0xDD 0x0641 #ARABIC LETTER FEH +0xDE 0x0642 #ARABIC LETTER QAF +0xDF 0x0643 #ARABIC LETTER KAF +0xE0 0x00E0 #LATIN SMALL LETTER A WITH GRAVE +0xE1 0x0644 #ARABIC LETTER LAM +0xE2 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x0645 #ARABIC LETTER MEEM +0xE4 0x0646 #ARABIC LETTER NOON +0xE5 0x0647 #ARABIC LETTER HEH +0xE6 0x0648 #ARABIC LETTER WAW +0xE7 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 #LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA #LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x0649 #ARABIC LETTER ALEF MAKSURA +0xED 0x064A #ARABIC LETTER YEH +0xEE 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF #LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x064B #ARABIC FATHATAN +0xF1 0x064C #ARABIC DAMMATAN +0xF2 0x064D #ARABIC KASRATAN +0xF3 0x064E #ARABIC FATHA +0xF4 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x064F #ARABIC DAMMA +0xF6 0x0650 #ARABIC KASRA +0xF7 0x00F7 #DIVISION SIGN +0xF8 0x0651 #ARABIC SHADDA +0xF9 0x00F9 #LATIN SMALL LETTER U WITH GRAVE +0xFA 0x0652 #ARABIC SUKUN +0xFB 0x00FB #LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x200E #LEFT-TO-RIGHT MARK +0xFE 0x200F #RIGHT-TO-LEFT MARK +0xFF #UNDEFINED diff --git a/charsets/cp1257.txt b/charsets/cp1257.txt new file mode 100644 index 0000000..ca5f9a9 --- /dev/null +++ b/charsets/cp1257.txt @@ -0,0 +1,274 @@ +# +# Name: cp1257 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# 
Column #1 is the cp1257 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1257 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 
#LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL 
LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 #UNDEFINED +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 #UNDEFINED +0x89 0x2030 #PER MILLE SIGN +0x8A #UNDEFINED +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C #UNDEFINED +0x8D 0x00A8 #DIAERESIS +0x8E 0x02C7 #CARON +0x8F 0x00B8 #CEDILLA +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 0x2122 #TRADE MARK SIGN +0x9A #UNDEFINED +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C #UNDEFINED +0x9D 0x00AF #MACRON +0x9E 0x02DB #OGONEK +0x9F #UNDEFINED +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 #UNDEFINED +0xA2 0x00A2 #CENT SIGN +0xA3 0x00A3 #POUND SIGN +0xA4 0x00A4 #CURRENCY SIGN +0xA5 #UNDEFINED +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00D8 #LATIN CAPITAL LETTER O WITH STROKE +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x0156 #LATIN CAPITAL LETTER R WITH CEDILLA +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x00C6 #LATIN CAPITAL LETTER AE +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x00B2 #SUPERSCRIPT TWO +0xB3 0x00B3 #SUPERSCRIPT THREE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x00F8 #LATIN SMALL LETTER O WITH STROKE +0xB9 0x00B9 #SUPERSCRIPT ONE +0xBA 0x0157 #LATIN SMALL LETTER R WITH CEDILLA +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC #VULGAR FRACTION ONE QUARTER +0xBD 0x00BD #VULGAR FRACTION ONE HALF 
+0xBE 0x00BE #VULGAR FRACTION THREE QUARTERS +0xBF 0x00E6 #LATIN SMALL LETTER AE +0xC0 0x0104 #LATIN CAPITAL LETTER A WITH OGONEK +0xC1 0x012E #LATIN CAPITAL LETTER I WITH OGONEK +0xC2 0x0100 #LATIN CAPITAL LETTER A WITH MACRON +0xC3 0x0106 #LATIN CAPITAL LETTER C WITH ACUTE +0xC4 0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x0118 #LATIN CAPITAL LETTER E WITH OGONEK +0xC7 0x0112 #LATIN CAPITAL LETTER E WITH MACRON +0xC8 0x010C #LATIN CAPITAL LETTER C WITH CARON +0xC9 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x0179 #LATIN CAPITAL LETTER Z WITH ACUTE +0xCB 0x0116 #LATIN CAPITAL LETTER E WITH DOT ABOVE +0xCC 0x0122 #LATIN CAPITAL LETTER G WITH CEDILLA +0xCD 0x0136 #LATIN CAPITAL LETTER K WITH CEDILLA +0xCE 0x012A #LATIN CAPITAL LETTER I WITH MACRON +0xCF 0x013B #LATIN CAPITAL LETTER L WITH CEDILLA +0xD0 0x0160 #LATIN CAPITAL LETTER S WITH CARON +0xD1 0x0143 #LATIN CAPITAL LETTER N WITH ACUTE +0xD2 0x0145 #LATIN CAPITAL LETTER N WITH CEDILLA +0xD3 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x014C #LATIN CAPITAL LETTER O WITH MACRON +0xD5 0x00D5 #LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 #MULTIPLICATION SIGN +0xD8 0x0172 #LATIN CAPITAL LETTER U WITH OGONEK +0xD9 0x0141 #LATIN CAPITAL LETTER L WITH STROKE +0xDA 0x015A #LATIN CAPITAL LETTER S WITH ACUTE +0xDB 0x016A #LATIN CAPITAL LETTER U WITH MACRON +0xDC 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x017B #LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xDE 0x017D #LATIN CAPITAL LETTER Z WITH CARON +0xDF 0x00DF #LATIN SMALL LETTER SHARP S +0xE0 0x0105 #LATIN SMALL LETTER A WITH OGONEK +0xE1 0x012F #LATIN SMALL LETTER I WITH OGONEK +0xE2 0x0101 #LATIN SMALL LETTER A WITH MACRON +0xE3 0x0107 #LATIN SMALL LETTER C WITH ACUTE +0xE4 0x00E4 #LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 #LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x0119 #LATIN SMALL LETTER E WITH OGONEK +0xE7 0x0113 #LATIN SMALL 
LETTER E WITH MACRON +0xE8 0x010D #LATIN SMALL LETTER C WITH CARON +0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE +0xEA 0x017A #LATIN SMALL LETTER Z WITH ACUTE +0xEB 0x0117 #LATIN SMALL LETTER E WITH DOT ABOVE +0xEC 0x0123 #LATIN SMALL LETTER G WITH CEDILLA +0xED 0x0137 #LATIN SMALL LETTER K WITH CEDILLA +0xEE 0x012B #LATIN SMALL LETTER I WITH MACRON +0xEF 0x013C #LATIN SMALL LETTER L WITH CEDILLA +0xF0 0x0161 #LATIN SMALL LETTER S WITH CARON +0xF1 0x0144 #LATIN SMALL LETTER N WITH ACUTE +0xF2 0x0146 #LATIN SMALL LETTER N WITH CEDILLA +0xF3 0x00F3 #LATIN SMALL LETTER O WITH ACUTE +0xF4 0x014D #LATIN SMALL LETTER O WITH MACRON +0xF5 0x00F5 #LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 #DIVISION SIGN +0xF8 0x0173 #LATIN SMALL LETTER U WITH OGONEK +0xF9 0x0142 #LATIN SMALL LETTER L WITH STROKE +0xFA 0x015B #LATIN SMALL LETTER S WITH ACUTE +0xFB 0x016B #LATIN SMALL LETTER U WITH MACRON +0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x017C #LATIN SMALL LETTER Z WITH DOT ABOVE +0xFE 0x017E #LATIN SMALL LETTER Z WITH CARON +0xFF 0x02D9 #DOT ABOVE diff --git a/charsets/cp1258.txt b/charsets/cp1258.txt new file mode 100644 index 0000000..8841b64 --- /dev/null +++ b/charsets/cp1258.txt @@ -0,0 +1,274 @@ +# +# Name: cp1258 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1258 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1258 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B 
#VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN 
CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT +0x89 0x2030 #PER MILLE SIGN +0x8A #UNDEFINED 
+0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x0152 #LATIN CAPITAL LIGATURE OE +0x8D #UNDEFINED +0x8E #UNDEFINED +0x8F #UNDEFINED +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 0x02DC #SMALL TILDE +0x99 0x2122 #TRADE MARK SIGN +0x9A #UNDEFINED +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C 0x0153 #LATIN SMALL LIGATURE OE +0x9D #UNDEFINED +0x9E #UNDEFINED +0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x00A1 #INVERTED EXCLAMATION MARK +0xA2 0x00A2 #CENT SIGN +0xA3 0x00A3 #POUND SIGN +0xA4 0x00A4 #CURRENCY SIGN +0xA5 0x00A5 #YEN SIGN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x00AA #FEMININE ORDINAL INDICATOR +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x00AF #MACRON +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x00B2 #SUPERSCRIPT TWO +0xB3 0x00B3 #SUPERSCRIPT THREE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x00B8 #CEDILLA +0xB9 0x00B9 #SUPERSCRIPT ONE +0xBA 0x00BA #MASCULINE ORDINAL INDICATOR +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC #VULGAR FRACTION ONE QUARTER +0xBD 0x00BD #VULGAR FRACTION ONE HALF +0xBE 0x00BE #VULGAR FRACTION THREE QUARTERS +0xBF 0x00BF #INVERTED QUESTION MARK +0xC0 0x00C0 #LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 #LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x0102 #LATIN CAPITAL LETTER A WITH BREVE +0xC4 0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 #LATIN CAPITAL LETTER AE 
+0xC7 0x00C7 #LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 #LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA #LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB #LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x0300 #COMBINING GRAVE ACCENT +0xCD 0x00CD #LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF #LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x0110 #LATIN CAPITAL LETTER D WITH STROKE +0xD1 0x00D1 #LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x0309 #COMBINING HOOK ABOVE +0xD3 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x01A0 #LATIN CAPITAL LETTER O WITH HORN +0xD6 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 #MULTIPLICATION SIGN +0xD8 0x00D8 #LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 #LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA #LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB #LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x01AF #LATIN CAPITAL LETTER U WITH HORN +0xDE 0x0303 #COMBINING TILDE +0xDF 0x00DF #LATIN SMALL LETTER SHARP S +0xE0 0x00E0 #LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 #LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x0103 #LATIN SMALL LETTER A WITH BREVE +0xE4 0x00E4 #LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 #LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 #LATIN SMALL LETTER AE +0xE7 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 #LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA #LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x0301 #COMBINING ACUTE ACCENT +0xED 0x00ED #LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF #LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x0111 #LATIN SMALL LETTER D WITH STROKE +0xF1 0x00F1 #LATIN SMALL 
LETTER N WITH TILDE +0xF2 0x0323 #COMBINING DOT BELOW +0xF3 0x00F3 #LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x01A1 #LATIN SMALL LETTER O WITH HORN +0xF6 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 #DIVISION SIGN +0xF8 0x00F8 #LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 #LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA #LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB #LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x01B0 #LATIN SMALL LETTER U WITH HORN +0xFE 0x20AB #DONG SIGN +0xFF 0x00FF #LATIN SMALL LETTER Y WITH DIAERESIS diff --git a/charsets/cp437.txt b/charsets/cp437.txt new file mode 100644 index 0000000..479076f --- /dev/null +++ b/charsets/cp437.txt @@ -0,0 +1,274 @@ +# +# Name: cp437_DOSLatinUS to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp437_DOSLatinUS code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp437_DOSLatinUS order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a 
#SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET 
+0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00e4 #LATIN SMALL LETTER A WITH DIAERESIS +0x85 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x86 0x00e5 #LATIN SMALL LETTER A WITH RING ABOVE +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x89 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x8a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x8b 0x00ef #LATIN SMALL LETTER I WITH DIAERESIS +0x8c 0x00ee #LATIN SMALL LETTER I WITH CIRCUMFLEX +0x8d 0x00ec #LATIN SMALL LETTER I WITH GRAVE +0x8e 0x00c4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0x8f 0x00c5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0x90 0x00c9 
#LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x00e6 #LATIN SMALL LIGATURE AE +0x92 0x00c6 #LATIN CAPITAL LIGATURE AE +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00f6 #LATIN SMALL LETTER O WITH DIAERESIS +0x95 0x00f2 #LATIN SMALL LETTER O WITH GRAVE +0x96 0x00fb #LATIN SMALL LETTER U WITH CIRCUMFLEX +0x97 0x00f9 #LATIN SMALL LETTER U WITH GRAVE +0x98 0x00ff #LATIN SMALL LETTER Y WITH DIAERESIS +0x99 0x00d6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0x9a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x9b 0x00a2 #CENT SIGN +0x9c 0x00a3 #POUND SIGN +0x9d 0x00a5 #YEN SIGN +0x9e 0x20a7 #PESETA SIGN +0x9f 0x0192 #LATIN SMALL LETTER F WITH HOOK +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00f1 #LATIN SMALL LETTER N WITH TILDE +0xa5 0x00d1 #LATIN CAPITAL LETTER N WITH TILDE +0xa6 0x00aa #FEMININE ORDINAL INDICATOR +0xa7 0x00ba #MASCULINE ORDINAL INDICATOR +0xa8 0x00bf #INVERTED QUESTION MARK +0xa9 0x2310 #REVERSED NOT SIGN +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00a1 #INVERTED EXCLAMATION MARK +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb6 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb7 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xb8 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xbe 0x255b #BOX 
DRAWINGS UP SINGLE AND LEFT DOUBLE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xc7 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xd0 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xd1 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xd2 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xd3 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xd4 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xd5 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xd6 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xd7 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xd8 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x258c #LEFT HALF BLOCK +0xde 0x2590 #RIGHT HALF BLOCK +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x03b1 #GREEK SMALL LETTER ALPHA +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x0393 #GREEK CAPITAL LETTER GAMMA +0xe3 0x03c0 #GREEK SMALL LETTER PI +0xe4 0x03a3 #GREEK CAPITAL LETTER SIGMA +0xe5 0x03c3 #GREEK SMALL LETTER SIGMA +0xe6 0x00b5 #MICRO SIGN +0xe7 0x03c4 #GREEK SMALL LETTER TAU +0xe8 
0x03a6 #GREEK CAPITAL LETTER PHI +0xe9 0x0398 #GREEK CAPITAL LETTER THETA +0xea 0x03a9 #GREEK CAPITAL LETTER OMEGA +0xeb 0x03b4 #GREEK SMALL LETTER DELTA +0xec 0x221e #INFINITY +0xed 0x03c6 #GREEK SMALL LETTER PHI +0xee 0x03b5 #GREEK SMALL LETTER EPSILON +0xef 0x2229 #INTERSECTION +0xf0 0x2261 #IDENTICAL TO +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x2265 #GREATER-THAN OR EQUAL TO +0xf3 0x2264 #LESS-THAN OR EQUAL TO +0xf4 0x2320 #TOP HALF INTEGRAL +0xf5 0x2321 #BOTTOM HALF INTEGRAL +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x2248 #ALMOST EQUAL TO +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x2219 #BULLET OPERATOR +0xfa 0x00b7 #MIDDLE DOT +0xfb 0x221a #SQUARE ROOT +0xfc 0x207f #SUPERSCRIPT LATIN SMALL LETTER N +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + \ No newline at end of file diff --git a/charsets/cp850.txt b/charsets/cp850.txt new file mode 100644 index 0000000..312e439 --- /dev/null +++ b/charsets/cp850.txt @@ -0,0 +1,274 @@ +# +# Name: cp850_DOSLatin1 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. 
Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp850_DOSLatin1 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp850_DOSLatin1 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f 
#QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 
0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00e4 #LATIN SMALL LETTER A WITH DIAERESIS +0x85 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x86 0x00e5 #LATIN SMALL LETTER A WITH RING ABOVE +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x89 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x8a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x8b 0x00ef #LATIN SMALL LETTER I WITH DIAERESIS +0x8c 0x00ee #LATIN SMALL LETTER I WITH CIRCUMFLEX +0x8d 0x00ec #LATIN SMALL LETTER I WITH GRAVE +0x8e 0x00c4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0x8f 0x00c5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0x90 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x00e6 #LATIN SMALL LIGATURE AE +0x92 0x00c6 #LATIN CAPITAL LIGATURE AE +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00f6 #LATIN SMALL LETTER O WITH DIAERESIS +0x95 0x00f2 #LATIN SMALL LETTER O WITH GRAVE +0x96 0x00fb #LATIN SMALL LETTER U WITH CIRCUMFLEX +0x97 0x00f9 #LATIN SMALL LETTER U WITH GRAVE +0x98 0x00ff #LATIN SMALL LETTER Y WITH DIAERESIS +0x99 0x00d6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0x9a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x9b 0x00f8 #LATIN SMALL LETTER O WITH STROKE +0x9c 0x00a3 #POUND SIGN +0x9d 0x00d8 #LATIN CAPITAL LETTER O WITH STROKE +0x9e 0x00d7 #MULTIPLICATION SIGN +0x9f 0x0192 #LATIN SMALL LETTER F WITH HOOK +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00f1 #LATIN SMALL LETTER N WITH TILDE +0xa5 0x00d1 #LATIN 
CAPITAL LETTER N WITH TILDE +0xa6 0x00aa #FEMININE ORDINAL INDICATOR +0xa7 0x00ba #MASCULINE ORDINAL INDICATOR +0xa8 0x00bf #INVERTED QUESTION MARK +0xa9 0x00ae #REGISTERED SIGN +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00a1 #INVERTED EXCLAMATION MARK +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x00c1 #LATIN CAPITAL LETTER A WITH ACUTE +0xb6 0x00c2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xb7 0x00c0 #LATIN CAPITAL LETTER A WITH GRAVE +0xb8 0x00a9 #COPYRIGHT SIGN +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x00a2 #CENT SIGN +0xbe 0x00a5 #YEN SIGN +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x00e3 #LATIN SMALL LETTER A WITH TILDE +0xc7 0x00c3 #LATIN CAPITAL LETTER A WITH TILDE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x00a4 #CURRENCY SIGN +0xd0 0x00f0 #LATIN SMALL LETTER ETH +0xd1 0x00d0 #LATIN CAPITAL LETTER ETH +0xd2 0x00ca #LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xd3 0x00cb #LATIN 
CAPITAL LETTER E WITH DIAERESIS +0xd4 0x00c8 #LATIN CAPITAL LETTER E WITH GRAVE +0xd5 0x0131 #LATIN SMALL LETTER DOTLESS I +0xd6 0x00cd #LATIN CAPITAL LETTER I WITH ACUTE +0xd7 0x00ce #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xd8 0x00cf #LATIN CAPITAL LETTER I WITH DIAERESIS +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x00a6 #BROKEN BAR +0xde 0x00cc #LATIN CAPITAL LETTER I WITH GRAVE +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x00d3 #LATIN CAPITAL LETTER O WITH ACUTE +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x00d4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xe3 0x00d2 #LATIN CAPITAL LETTER O WITH GRAVE +0xe4 0x00f5 #LATIN SMALL LETTER O WITH TILDE +0xe5 0x00d5 #LATIN CAPITAL LETTER O WITH TILDE +0xe6 0x00b5 #MICRO SIGN +0xe7 0x00fe #LATIN SMALL LETTER THORN +0xe8 0x00de #LATIN CAPITAL LETTER THORN +0xe9 0x00da #LATIN CAPITAL LETTER U WITH ACUTE +0xea 0x00db #LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xeb 0x00d9 #LATIN CAPITAL LETTER U WITH GRAVE +0xec 0x00fd #LATIN SMALL LETTER Y WITH ACUTE +0xed 0x00dd #LATIN CAPITAL LETTER Y WITH ACUTE +0xee 0x00af #MACRON +0xef 0x00b4 #ACUTE ACCENT +0xf0 0x00ad #SOFT HYPHEN +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x2017 #DOUBLE LOW LINE +0xf3 0x00be #VULGAR FRACTION THREE QUARTERS +0xf4 0x00b6 #PILCROW SIGN +0xf5 0x00a7 #SECTION SIGN +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x00b8 #CEDILLA +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x00a8 #DIAERESIS +0xfa 0x00b7 #MIDDLE DOT +0xfb 0x00b9 #SUPERSCRIPT ONE +0xfc 0x00b3 #SUPERSCRIPT THREE +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + \ No newline at end of file diff --git a/charsets/cp852.txt b/charsets/cp852.txt new file mode 100644 index 0000000..bae9e7a --- /dev/null +++ b/charsets/cp852.txt @@ -0,0 +1,274 @@ +# +# Name: cp852_DOSLatin2 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: 
Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp852_DOSLatin2 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp852_DOSLatin2 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN 
SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL 
LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00e4 #LATIN SMALL LETTER A WITH DIAERESIS +0x85 0x016f #LATIN SMALL LETTER U WITH RING ABOVE +0x86 0x0107 #LATIN SMALL LETTER C WITH ACUTE +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x0142 #LATIN SMALL LETTER L WITH STROKE +0x89 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x8a 0x0150 #LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0x8b 0x0151 #LATIN SMALL LETTER O WITH DOUBLE ACUTE +0x8c 0x00ee #LATIN SMALL LETTER I WITH CIRCUMFLEX +0x8d 0x0179 #LATIN CAPITAL LETTER Z WITH ACUTE +0x8e 0x00c4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0x8f 0x0106 #LATIN CAPITAL LETTER C WITH ACUTE +0x90 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x0139 #LATIN CAPITAL LETTER L WITH ACUTE +0x92 0x013a #LATIN SMALL LETTER L WITH ACUTE +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00f6 #LATIN SMALL LETTER O WITH DIAERESIS +0x95 0x013d #LATIN CAPITAL LETTER L WITH CARON +0x96 0x013e #LATIN SMALL LETTER L WITH CARON +0x97 0x015a #LATIN CAPITAL LETTER S WITH ACUTE +0x98 0x015b #LATIN SMALL LETTER S WITH ACUTE +0x99 0x00d6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0x9a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x9b 0x0164 #LATIN CAPITAL LETTER T WITH CARON +0x9c 0x0165 #LATIN SMALL LETTER T WITH CARON +0x9d 0x0141 #LATIN CAPITAL LETTER L WITH STROKE +0x9e 0x00d7 #MULTIPLICATION SIGN +0x9f 0x010d #LATIN SMALL LETTER C WITH CARON +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 
0x0104 #LATIN CAPITAL LETTER A WITH OGONEK +0xa5 0x0105 #LATIN SMALL LETTER A WITH OGONEK +0xa6 0x017d #LATIN CAPITAL LETTER Z WITH CARON +0xa7 0x017e #LATIN SMALL LETTER Z WITH CARON +0xa8 0x0118 #LATIN CAPITAL LETTER E WITH OGONEK +0xa9 0x0119 #LATIN SMALL LETTER E WITH OGONEK +0xaa 0x00ac #NOT SIGN +0xab 0x017a #LATIN SMALL LETTER Z WITH ACUTE +0xac 0x010c #LATIN CAPITAL LETTER C WITH CARON +0xad 0x015f #LATIN SMALL LETTER S WITH CEDILLA +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x00c1 #LATIN CAPITAL LETTER A WITH ACUTE +0xb6 0x00c2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xb7 0x011a #LATIN CAPITAL LETTER E WITH CARON +0xb8 0x015e #LATIN CAPITAL LETTER S WITH CEDILLA +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x017b #LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xbe 0x017c #LATIN SMALL LETTER Z WITH DOT ABOVE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x0102 #LATIN CAPITAL LETTER A WITH BREVE +0xc7 0x0103 #LATIN SMALL LETTER A WITH BREVE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE 
VERTICAL AND HORIZONTAL +0xcf 0x00a4 #CURRENCY SIGN +0xd0 0x0111 #LATIN SMALL LETTER D WITH STROKE +0xd1 0x0110 #LATIN CAPITAL LETTER D WITH STROKE +0xd2 0x010e #LATIN CAPITAL LETTER D WITH CARON +0xd3 0x00cb #LATIN CAPITAL LETTER E WITH DIAERESIS +0xd4 0x010f #LATIN SMALL LETTER D WITH CARON +0xd5 0x0147 #LATIN CAPITAL LETTER N WITH CARON +0xd6 0x00cd #LATIN CAPITAL LETTER I WITH ACUTE +0xd7 0x00ce #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xd8 0x011b #LATIN SMALL LETTER E WITH CARON +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x0162 #LATIN CAPITAL LETTER T WITH CEDILLA +0xde 0x016e #LATIN CAPITAL LETTER U WITH RING ABOVE +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x00d3 #LATIN CAPITAL LETTER O WITH ACUTE +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x00d4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xe3 0x0143 #LATIN CAPITAL LETTER N WITH ACUTE +0xe4 0x0144 #LATIN SMALL LETTER N WITH ACUTE +0xe5 0x0148 #LATIN SMALL LETTER N WITH CARON +0xe6 0x0160 #LATIN CAPITAL LETTER S WITH CARON +0xe7 0x0161 #LATIN SMALL LETTER S WITH CARON +0xe8 0x0154 #LATIN CAPITAL LETTER R WITH ACUTE +0xe9 0x00da #LATIN CAPITAL LETTER U WITH ACUTE +0xea 0x0155 #LATIN SMALL LETTER R WITH ACUTE +0xeb 0x0170 #LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0xec 0x00fd #LATIN SMALL LETTER Y WITH ACUTE +0xed 0x00dd #LATIN CAPITAL LETTER Y WITH ACUTE +0xee 0x0163 #LATIN SMALL LETTER T WITH CEDILLA +0xef 0x00b4 #ACUTE ACCENT +0xf0 0x00ad #SOFT HYPHEN +0xf1 0x02dd #DOUBLE ACUTE ACCENT +0xf2 0x02db #OGONEK +0xf3 0x02c7 #CARON +0xf4 0x02d8 #BREVE +0xf5 0x00a7 #SECTION SIGN +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x00b8 #CEDILLA +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x00a8 #DIAERESIS +0xfa 0x02d9 #DOT ABOVE +0xfb 0x0171 #LATIN SMALL LETTER U WITH DOUBLE ACUTE +0xfc 0x0158 #LATIN CAPITAL LETTER R WITH CARON +0xfd 0x0159 #LATIN SMALL LETTER R WITH CARON +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + \ No 
newline at end of file diff --git a/charsets/cp855.txt b/charsets/cp855.txt new file mode 100644 index 0000000..af0168d --- /dev/null +++ b/charsets/cp855.txt @@ -0,0 +1,275 @@ +# +# Name: cp855_DOSCyrillic to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp855_DOSCyrillic code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp855_DOSCyrillic order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT 
ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 
0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x0452 #CYRILLIC SMALL LETTER DJE +0x81 0x0402 #CYRILLIC CAPITAL LETTER DJE +0x82 0x0453 #CYRILLIC SMALL LETTER GJE +0x83 0x0403 #CYRILLIC CAPITAL LETTER GJE +0x84 0x0451 #CYRILLIC SMALL LETTER IO +0x85 0x0401 #CYRILLIC CAPITAL LETTER IO +0x86 0x0454 #CYRILLIC SMALL LETTER UKRAINIAN IE +0x87 0x0404 #CYRILLIC CAPITAL LETTER UKRAINIAN IE +0x88 0x0455 #CYRILLIC SMALL LETTER DZE +0x89 0x0405 #CYRILLIC CAPITAL LETTER DZE +0x8a 0x0456 #CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +0x8b 0x0406 #CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0x8c 0x0457 #CYRILLIC SMALL LETTER YI +0x8d 0x0407 #CYRILLIC CAPITAL LETTER YI +0x8e 0x0458 #CYRILLIC SMALL LETTER JE +0x8f 0x0408 #CYRILLIC CAPITAL LETTER JE +0x90 0x0459 #CYRILLIC SMALL LETTER LJE +0x91 0x0409 #CYRILLIC CAPITAL LETTER LJE +0x92 0x045a #CYRILLIC SMALL LETTER NJE +0x93 0x040a #CYRILLIC CAPITAL LETTER NJE +0x94 0x045b #CYRILLIC SMALL LETTER TSHE +0x95 0x040b #CYRILLIC CAPITAL LETTER TSHE +0x96 0x045c #CYRILLIC SMALL LETTER KJE +0x97 0x040c #CYRILLIC CAPITAL LETTER KJE +0x98 0x045e #CYRILLIC SMALL LETTER SHORT U +0x99 0x040e #CYRILLIC CAPITAL LETTER SHORT U +0x9a 0x045f #CYRILLIC SMALL LETTER DZHE +0x9b 0x040f #CYRILLIC CAPITAL LETTER DZHE +0x9c 0x044e #CYRILLIC SMALL LETTER YU +0x9d 0x042e #CYRILLIC CAPITAL LETTER YU +0x9e 0x044a #CYRILLIC SMALL LETTER HARD SIGN +0x9f 0x042a #CYRILLIC CAPITAL LETTER HARD SIGN +0xa0 0x0430 #CYRILLIC 
SMALL LETTER A +0xa1 0x0410 #CYRILLIC CAPITAL LETTER A +0xa2 0x0431 #CYRILLIC SMALL LETTER BE +0xa3 0x0411 #CYRILLIC CAPITAL LETTER BE +0xa4 0x0446 #CYRILLIC SMALL LETTER TSE +0xa5 0x0426 #CYRILLIC CAPITAL LETTER TSE +0xa6 0x0434 #CYRILLIC SMALL LETTER DE +0xa7 0x0414 #CYRILLIC CAPITAL LETTER DE +0xa8 0x0435 #CYRILLIC SMALL LETTER IE +0xa9 0x0415 #CYRILLIC CAPITAL LETTER IE +0xaa 0x0444 #CYRILLIC SMALL LETTER EF +0xab 0x0424 #CYRILLIC CAPITAL LETTER EF +0xac 0x0433 #CYRILLIC SMALL LETTER GHE +0xad 0x0413 #CYRILLIC CAPITAL LETTER GHE +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x0445 #CYRILLIC SMALL LETTER HA +0xb6 0x0425 #CYRILLIC CAPITAL LETTER HA +0xb7 0x0438 #CYRILLIC SMALL LETTER I +0xb8 0x0418 #CYRILLIC CAPITAL LETTER I +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x0439 #CYRILLIC SMALL LETTER SHORT I +0xbe 0x0419 #CYRILLIC CAPITAL LETTER SHORT I +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x043a #CYRILLIC SMALL LETTER KA +0xc7 0x041a #CYRILLIC CAPITAL LETTER KA +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 
0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x00a4 #CURRENCY SIGN +0xd0 0x043b #CYRILLIC SMALL LETTER EL +0xd1 0x041b #CYRILLIC CAPITAL LETTER EL +0xd2 0x043c #CYRILLIC SMALL LETTER EM +0xd3 0x041c #CYRILLIC CAPITAL LETTER EM +0xd4 0x043d #CYRILLIC SMALL LETTER EN +0xd5 0x041d #CYRILLIC CAPITAL LETTER EN +0xd6 0x043e #CYRILLIC SMALL LETTER O +0xd7 0x041e #CYRILLIC CAPITAL LETTER O +0xd8 0x043f #CYRILLIC SMALL LETTER PE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x041f #CYRILLIC CAPITAL LETTER PE +0xde 0x044f #CYRILLIC SMALL LETTER YA +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x042f #CYRILLIC CAPITAL LETTER YA +0xe1 0x0440 #CYRILLIC SMALL LETTER ER +0xe2 0x0420 #CYRILLIC CAPITAL LETTER ER +0xe3 0x0441 #CYRILLIC SMALL LETTER ES +0xe4 0x0421 #CYRILLIC CAPITAL LETTER ES +0xe5 0x0442 #CYRILLIC SMALL LETTER TE +0xe6 0x0422 #CYRILLIC CAPITAL LETTER TE +0xe7 0x0443 #CYRILLIC SMALL LETTER U +0xe8 0x0423 #CYRILLIC CAPITAL LETTER U +0xe9 0x0436 #CYRILLIC SMALL LETTER ZHE +0xea 0x0416 #CYRILLIC CAPITAL LETTER ZHE +0xeb 0x0432 #CYRILLIC SMALL LETTER VE +0xec 0x0412 #CYRILLIC CAPITAL LETTER VE +0xed 0x044c #CYRILLIC SMALL LETTER SOFT SIGN +0xee 0x042c #CYRILLIC CAPITAL LETTER SOFT SIGN +0xef 0x2116 #NUMERO SIGN +0xf0 0x00ad #SOFT HYPHEN +0xf1 0x044b #CYRILLIC SMALL LETTER YERU +0xf2 0x042b #CYRILLIC CAPITAL LETTER YERU +0xf3 0x0437 #CYRILLIC SMALL LETTER ZE +0xf4 0x0417 #CYRILLIC CAPITAL LETTER ZE +0xf5 0x0448 #CYRILLIC SMALL LETTER SHA +0xf6 0x0428 #CYRILLIC CAPITAL LETTER SHA +0xf7 0x044d #CYRILLIC SMALL LETTER E +0xf8 0x042d #CYRILLIC CAPITAL LETTER E +0xf9 0x0449 #CYRILLIC SMALL LETTER SHCHA +0xfa 0x0429 #CYRILLIC CAPITAL LETTER SHCHA +0xfb 0x0447 #CYRILLIC SMALL LETTER CHE +0xfc 0x0427 #CYRILLIC CAPITAL LETTER CHE +0xfd 0x00a7 #SECTION SIGN +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git 
a/charsets/cp857.txt b/charsets/cp857.txt new file mode 100644 index 0000000..7dde69f --- /dev/null +++ b/charsets/cp857.txt @@ -0,0 +1,275 @@ +# +# Name: cp857_DOSTurkish to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp857_DOSTurkish code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp857_DOSTurkish order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 
0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 
0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00e4 #LATIN SMALL LETTER A WITH DIAERESIS +0x85 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x86 0x00e5 #LATIN SMALL LETTER A WITH RING ABOVE +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x89 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x8a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x8b 0x00ef #LATIN SMALL LETTER I WITH DIAERESIS +0x8c 0x00ee #LATIN SMALL LETTER I WITH CIRCUMFLEX +0x8d 0x0131 #LATIN SMALL LETTER DOTLESS I +0x8e 0x00c4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0x8f 0x00c5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0x90 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x00e6 #LATIN SMALL LIGATURE AE +0x92 0x00c6 #LATIN CAPITAL LIGATURE AE +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00f6 #LATIN SMALL LETTER O WITH DIAERESIS +0x95 0x00f2 #LATIN SMALL LETTER O WITH GRAVE +0x96 0x00fb #LATIN SMALL LETTER U WITH CIRCUMFLEX +0x97 0x00f9 #LATIN SMALL LETTER U WITH GRAVE +0x98 0x0130 #LATIN CAPITAL LETTER I WITH DOT ABOVE +0x99 0x00d6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0x9a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x9b 0x00f8 #LATIN SMALL LETTER O WITH STROKE +0x9c 0x00a3 #POUND SIGN +0x9d 0x00d8 #LATIN CAPITAL LETTER O WITH STROKE +0x9e 0x015e 
#LATIN CAPITAL LETTER S WITH CEDILLA +0x9f 0x015f #LATIN SMALL LETTER S WITH CEDILLA +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00f1 #LATIN SMALL LETTER N WITH TILDE +0xa5 0x00d1 #LATIN CAPITAL LETTER N WITH TILDE +0xa6 0x011e #LATIN CAPITAL LETTER G WITH BREVE +0xa7 0x011f #LATIN SMALL LETTER G WITH BREVE +0xa8 0x00bf #INVERTED QUESTION MARK +0xa9 0x00ae #REGISTERED SIGN +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00a1 #INVERTED EXCLAMATION MARK +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x00c1 #LATIN CAPITAL LETTER A WITH ACUTE +0xb6 0x00c2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xb7 0x00c0 #LATIN CAPITAL LETTER A WITH GRAVE +0xb8 0x00a9 #COPYRIGHT SIGN +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x00a2 #CENT SIGN +0xbe 0x00a5 #YEN SIGN +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x00e3 #LATIN SMALL LETTER A WITH TILDE +0xc7 0x00c3 #LATIN CAPITAL LETTER A WITH TILDE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN 
AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x00a4 #CURRENCY SIGN +0xd0 0x00ba #MASCULINE ORDINAL INDICATOR +0xd1 0x00aa #FEMININE ORDINAL INDICATOR +0xd2 0x00ca #LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xd3 0x00cb #LATIN CAPITAL LETTER E WITH DIAERESIS +0xd4 0x00c8 #LATIN CAPITAL LETTER E WITH GRAVE +0xd5 #UNDEFINED +0xd6 0x00cd #LATIN CAPITAL LETTER I WITH ACUTE +0xd7 0x00ce #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xd8 0x00cf #LATIN CAPITAL LETTER I WITH DIAERESIS +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x00a6 #BROKEN BAR +0xde 0x00cc #LATIN CAPITAL LETTER I WITH GRAVE +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x00d3 #LATIN CAPITAL LETTER O WITH ACUTE +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x00d4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xe3 0x00d2 #LATIN CAPITAL LETTER O WITH GRAVE +0xe4 0x00f5 #LATIN SMALL LETTER O WITH TILDE +0xe5 0x00d5 #LATIN CAPITAL LETTER O WITH TILDE +0xe6 0x00b5 #MICRO SIGN +0xe7 #UNDEFINED +0xe8 0x00d7 #MULTIPLICATION SIGN +0xe9 0x00da #LATIN CAPITAL LETTER U WITH ACUTE +0xea 0x00db #LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xeb 0x00d9 #LATIN CAPITAL LETTER U WITH GRAVE +0xec 0x00ec #LATIN SMALL LETTER I WITH GRAVE +0xed 0x00ff #LATIN SMALL LETTER Y WITH DIAERESIS +0xee 0x00af #MACRON +0xef 0x00b4 #ACUTE ACCENT +0xf0 0x00ad #SOFT HYPHEN +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 #UNDEFINED +0xf3 0x00be #VULGAR FRACTION THREE QUARTERS +0xf4 0x00b6 #PILCROW SIGN +0xf5 0x00a7 #SECTION SIGN +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x00b8 #CEDILLA +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x00a8 #DIAERESIS +0xfa 0x00b7 #MIDDLE DOT +0xfb 0x00b9 #SUPERSCRIPT ONE +0xfc 0x00b3 #SUPERSCRIPT THREE +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git 
a/charsets/cp860.txt b/charsets/cp860.txt new file mode 100644 index 0000000..ce36f21 --- /dev/null +++ b/charsets/cp860.txt @@ -0,0 +1,275 @@ +# +# Name: cp860_DOSPortuguese to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp860_DOSPortuguese code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp860_DOSPortuguese order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO 
+0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N 
+0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00e3 #LATIN SMALL LETTER A WITH TILDE +0x85 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x86 0x00c1 #LATIN CAPITAL LETTER A WITH ACUTE +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x89 0x00ca #LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0x8a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x8b 0x00cd #LATIN CAPITAL LETTER I WITH ACUTE +0x8c 0x00d4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0x8d 0x00ec #LATIN SMALL LETTER I WITH GRAVE +0x8e 0x00c3 #LATIN CAPITAL LETTER A WITH TILDE +0x8f 0x00c2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0x90 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x00c0 #LATIN CAPITAL LETTER A WITH GRAVE +0x92 0x00c8 #LATIN CAPITAL LETTER E WITH GRAVE +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00f5 #LATIN SMALL LETTER O WITH TILDE +0x95 0x00f2 #LATIN SMALL LETTER O WITH GRAVE +0x96 0x00da #LATIN CAPITAL LETTER U WITH ACUTE +0x97 0x00f9 #LATIN SMALL LETTER U WITH GRAVE +0x98 0x00cc #LATIN CAPITAL LETTER I WITH GRAVE +0x99 0x00d5 #LATIN CAPITAL LETTER O WITH TILDE +0x9a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x9b 0x00a2 #CENT SIGN +0x9c 0x00a3 #POUND SIGN +0x9d 0x00d9 #LATIN CAPITAL LETTER U WITH GRAVE +0x9e 0x20a7 #PESETA SIGN +0x9f 
0x00d3 #LATIN CAPITAL LETTER O WITH ACUTE +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00f1 #LATIN SMALL LETTER N WITH TILDE +0xa5 0x00d1 #LATIN CAPITAL LETTER N WITH TILDE +0xa6 0x00aa #FEMININE ORDINAL INDICATOR +0xa7 0x00ba #MASCULINE ORDINAL INDICATOR +0xa8 0x00bf #INVERTED QUESTION MARK +0xa9 0x00d2 #LATIN CAPITAL LETTER O WITH GRAVE +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00a1 #INVERTED EXCLAMATION MARK +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb6 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb7 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xb8 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xbe 0x255b #BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xc7 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND 
RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xd0 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xd1 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xd2 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xd3 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xd4 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xd5 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xd6 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xd7 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xd8 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x258c #LEFT HALF BLOCK +0xde 0x2590 #RIGHT HALF BLOCK +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x03b1 #GREEK SMALL LETTER ALPHA +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x0393 #GREEK CAPITAL LETTER GAMMA +0xe3 0x03c0 #GREEK SMALL LETTER PI +0xe4 0x03a3 #GREEK CAPITAL LETTER SIGMA +0xe5 0x03c3 #GREEK SMALL LETTER SIGMA +0xe6 0x00b5 #MICRO SIGN +0xe7 0x03c4 #GREEK SMALL LETTER TAU +0xe8 0x03a6 #GREEK CAPITAL LETTER PHI +0xe9 0x0398 #GREEK CAPITAL LETTER THETA +0xea 0x03a9 #GREEK CAPITAL LETTER OMEGA +0xeb 0x03b4 #GREEK SMALL LETTER DELTA +0xec 0x221e #INFINITY +0xed 0x03c6 #GREEK SMALL LETTER PHI +0xee 0x03b5 #GREEK SMALL LETTER EPSILON +0xef 0x2229 #INTERSECTION +0xf0 0x2261 #IDENTICAL TO +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x2265 #GREATER-THAN OR EQUAL TO +0xf3 0x2264 #LESS-THAN OR EQUAL TO +0xf4 0x2320 #TOP HALF INTEGRAL +0xf5 0x2321 #BOTTOM HALF INTEGRAL +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x2248 #ALMOST EQUAL TO +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x2219 #BULLET OPERATOR 
+0xfa 0x00b7 #MIDDLE DOT +0xfb 0x221a #SQUARE ROOT +0xfc 0x207f #SUPERSCRIPT LATIN SMALL LETTER N +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git a/charsets/cp861.txt b/charsets/cp861.txt new file mode 100644 index 0000000..5a50e4e --- /dev/null +++ b/charsets/cp861.txt @@ -0,0 +1,275 @@ +# +# Name: cp861_DOSIcelandic to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp861_DOSIcelandic code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp861_DOSIcelandic order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT 
PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN 
SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00e4 #LATIN SMALL LETTER A WITH DIAERESIS +0x85 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x86 0x00e5 #LATIN SMALL LETTER A WITH RING ABOVE +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x89 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x8a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x8b 0x00d0 #LATIN CAPITAL LETTER ETH +0x8c 0x00f0 #LATIN SMALL LETTER ETH +0x8d 0x00de #LATIN CAPITAL LETTER THORN +0x8e 0x00c4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0x8f 0x00c5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0x90 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x00e6 #LATIN SMALL LIGATURE AE +0x92 0x00c6 #LATIN CAPITAL LIGATURE AE +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00f6 #LATIN SMALL LETTER O WITH DIAERESIS +0x95 0x00fe #LATIN SMALL LETTER THORN +0x96 0x00fb #LATIN SMALL LETTER U WITH CIRCUMFLEX +0x97 0x00dd #LATIN CAPITAL LETTER Y WITH ACUTE +0x98 0x00fd #LATIN SMALL LETTER Y WITH ACUTE +0x99 0x00d6 #LATIN CAPITAL LETTER O WITH 
DIAERESIS +0x9a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x9b 0x00f8 #LATIN SMALL LETTER O WITH STROKE +0x9c 0x00a3 #POUND SIGN +0x9d 0x00d8 #LATIN CAPITAL LETTER O WITH STROKE +0x9e 0x20a7 #PESETA SIGN +0x9f 0x0192 #LATIN SMALL LETTER F WITH HOOK +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00c1 #LATIN CAPITAL LETTER A WITH ACUTE +0xa5 0x00cd #LATIN CAPITAL LETTER I WITH ACUTE +0xa6 0x00d3 #LATIN CAPITAL LETTER O WITH ACUTE +0xa7 0x00da #LATIN CAPITAL LETTER U WITH ACUTE +0xa8 0x00bf #INVERTED QUESTION MARK +0xa9 0x2310 #REVERSED NOT SIGN +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00a1 #INVERTED EXCLAMATION MARK +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb6 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb7 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xb8 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xbe 0x255b #BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL 
+0xc6 0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xc7 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xd0 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xd1 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xd2 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xd3 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xd4 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xd5 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xd6 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xd7 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xd8 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x258c #LEFT HALF BLOCK +0xde 0x2590 #RIGHT HALF BLOCK +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x03b1 #GREEK SMALL LETTER ALPHA +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x0393 #GREEK CAPITAL LETTER GAMMA +0xe3 0x03c0 #GREEK SMALL LETTER PI +0xe4 0x03a3 #GREEK CAPITAL LETTER SIGMA +0xe5 0x03c3 #GREEK SMALL LETTER SIGMA +0xe6 0x00b5 #MICRO SIGN +0xe7 0x03c4 #GREEK SMALL LETTER TAU +0xe8 0x03a6 #GREEK CAPITAL LETTER PHI +0xe9 0x0398 #GREEK CAPITAL LETTER THETA +0xea 0x03a9 #GREEK CAPITAL LETTER OMEGA +0xeb 0x03b4 #GREEK SMALL LETTER DELTA +0xec 0x221e #INFINITY +0xed 0x03c6 #GREEK SMALL LETTER PHI +0xee 0x03b5 #GREEK SMALL LETTER EPSILON +0xef 0x2229 #INTERSECTION +0xf0 0x2261 #IDENTICAL TO +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x2265 #GREATER-THAN OR EQUAL TO +0xf3 
0x2264 #LESS-THAN OR EQUAL TO +0xf4 0x2320 #TOP HALF INTEGRAL +0xf5 0x2321 #BOTTOM HALF INTEGRAL +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x2248 #ALMOST EQUAL TO +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x2219 #BULLET OPERATOR +0xfa 0x00b7 #MIDDLE DOT +0xfb 0x221a #SQUARE ROOT +0xfc 0x207f #SUPERSCRIPT LATIN SMALL LETTER N +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git a/charsets/cp862.txt b/charsets/cp862.txt new file mode 100644 index 0000000..5a4f019 --- /dev/null +++ b/charsets/cp862.txt @@ -0,0 +1,275 @@ +# +# Name: cp862_DOSHebrew to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp862_DOSHebrew code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp862_DOSHebrew order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 
0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL 
LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x05d0 #HEBREW LETTER ALEF +0x81 0x05d1 #HEBREW LETTER BET +0x82 0x05d2 #HEBREW LETTER GIMEL +0x83 0x05d3 #HEBREW LETTER DALET +0x84 0x05d4 #HEBREW LETTER HE +0x85 0x05d5 #HEBREW LETTER VAV +0x86 0x05d6 #HEBREW LETTER ZAYIN +0x87 0x05d7 #HEBREW LETTER HET +0x88 0x05d8 #HEBREW LETTER TET +0x89 0x05d9 #HEBREW LETTER YOD +0x8a 0x05da #HEBREW LETTER FINAL KAF +0x8b 0x05db #HEBREW LETTER KAF +0x8c 0x05dc #HEBREW LETTER LAMED +0x8d 0x05dd #HEBREW LETTER FINAL MEM +0x8e 0x05de #HEBREW LETTER MEM +0x8f 0x05df #HEBREW LETTER FINAL NUN +0x90 0x05e0 #HEBREW LETTER NUN +0x91 0x05e1 #HEBREW LETTER SAMEKH +0x92 0x05e2 #HEBREW LETTER AYIN +0x93 0x05e3 #HEBREW LETTER FINAL PE +0x94 0x05e4 #HEBREW LETTER PE +0x95 0x05e5 #HEBREW LETTER FINAL TSADI +0x96 0x05e6 #HEBREW LETTER TSADI +0x97 0x05e7 #HEBREW LETTER QOF +0x98 0x05e8 #HEBREW LETTER RESH +0x99 0x05e9 #HEBREW LETTER SHIN +0x9a 0x05ea #HEBREW LETTER TAV +0x9b 0x00a2 #CENT SIGN +0x9c 0x00a3 #POUND SIGN +0x9d 0x00a5 #YEN SIGN +0x9e 0x20a7 #PESETA SIGN +0x9f 
0x0192 #LATIN SMALL LETTER F WITH HOOK +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00f1 #LATIN SMALL LETTER N WITH TILDE +0xa5 0x00d1 #LATIN CAPITAL LETTER N WITH TILDE +0xa6 0x00aa #FEMININE ORDINAL INDICATOR +0xa7 0x00ba #MASCULINE ORDINAL INDICATOR +0xa8 0x00bf #INVERTED QUESTION MARK +0xa9 0x2310 #REVERSED NOT SIGN +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00a1 #INVERTED EXCLAMATION MARK +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb6 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb7 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xb8 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xbe 0x255b #BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xc7 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 
#BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xd0 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xd1 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xd2 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xd3 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xd4 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xd5 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xd6 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xd7 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xd8 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x258c #LEFT HALF BLOCK +0xde 0x2590 #RIGHT HALF BLOCK +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x03b1 #GREEK SMALL LETTER ALPHA +0xe1 0x00df #LATIN SMALL LETTER SHARP S (GERMAN) +0xe2 0x0393 #GREEK CAPITAL LETTER GAMMA +0xe3 0x03c0 #GREEK SMALL LETTER PI +0xe4 0x03a3 #GREEK CAPITAL LETTER SIGMA +0xe5 0x03c3 #GREEK SMALL LETTER SIGMA +0xe6 0x00b5 #MICRO SIGN +0xe7 0x03c4 #GREEK SMALL LETTER TAU +0xe8 0x03a6 #GREEK CAPITAL LETTER PHI +0xe9 0x0398 #GREEK CAPITAL LETTER THETA +0xea 0x03a9 #GREEK CAPITAL LETTER OMEGA +0xeb 0x03b4 #GREEK SMALL LETTER DELTA +0xec 0x221e #INFINITY +0xed 0x03c6 #GREEK SMALL LETTER PHI +0xee 0x03b5 #GREEK SMALL LETTER EPSILON +0xef 0x2229 #INTERSECTION +0xf0 0x2261 #IDENTICAL TO +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x2265 #GREATER-THAN OR EQUAL TO +0xf3 0x2264 #LESS-THAN OR EQUAL TO +0xf4 0x2320 #TOP HALF INTEGRAL +0xf5 0x2321 #BOTTOM HALF INTEGRAL +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x2248 #ALMOST EQUAL TO +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x2219 #BULLET OPERATOR +0xfa 
0x00b7 #MIDDLE DOT +0xfb 0x221a #SQUARE ROOT +0xfc 0x207f #SUPERSCRIPT LATIN SMALL LETTER N +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git a/charsets/cp863.txt b/charsets/cp863.txt new file mode 100644 index 0000000..115afbd --- /dev/null +++ b/charsets/cp863.txt @@ -0,0 +1,275 @@ +# +# Name: cp863_DOSCanadaF to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp863_DOSCanadaF code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp863_DOSCanadaF order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS 
+0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H 
+0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00c2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0x85 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x86 0x00b6 #PILCROW SIGN +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x89 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x8a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x8b 0x00ef #LATIN SMALL LETTER I WITH DIAERESIS +0x8c 0x00ee #LATIN SMALL LETTER I WITH CIRCUMFLEX +0x8d 0x2017 #DOUBLE LOW LINE +0x8e 0x00c0 #LATIN CAPITAL LETTER A WITH GRAVE +0x8f 0x00a7 #SECTION SIGN +0x90 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x00c8 #LATIN CAPITAL LETTER E WITH GRAVE +0x92 0x00ca #LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00cb #LATIN CAPITAL LETTER E WITH DIAERESIS +0x95 0x00cf #LATIN CAPITAL LETTER I WITH DIAERESIS +0x96 0x00fb #LATIN SMALL LETTER U WITH CIRCUMFLEX +0x97 0x00f9 #LATIN SMALL LETTER U WITH GRAVE +0x98 0x00a4 #CURRENCY SIGN +0x99 0x00d4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0x9a 0x00dc #LATIN CAPITAL 
LETTER U WITH DIAERESIS +0x9b 0x00a2 #CENT SIGN +0x9c 0x00a3 #POUND SIGN +0x9d 0x00d9 #LATIN CAPITAL LETTER U WITH GRAVE +0x9e 0x00db #LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0x9f 0x0192 #LATIN SMALL LETTER F WITH HOOK +0xa0 0x00a6 #BROKEN BAR +0xa1 0x00b4 #ACUTE ACCENT +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00a8 #DIAERESIS +0xa5 0x00b8 #CEDILLA +0xa6 0x00b3 #SUPERSCRIPT THREE +0xa7 0x00af #MACRON +0xa8 0x00ce #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xa9 0x2310 #REVERSED NOT SIGN +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00be #VULGAR FRACTION THREE QUARTERS +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb6 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb7 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xb8 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xbe 0x255b #BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xc7 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xc8 0x255a #BOX DRAWINGS 
DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xd0 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xd1 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xd2 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xd3 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xd4 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xd5 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xd6 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xd7 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xd8 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x258c #LEFT HALF BLOCK +0xde 0x2590 #RIGHT HALF BLOCK +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x03b1 #GREEK SMALL LETTER ALPHA +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x0393 #GREEK CAPITAL LETTER GAMMA +0xe3 0x03c0 #GREEK SMALL LETTER PI +0xe4 0x03a3 #GREEK CAPITAL LETTER SIGMA +0xe5 0x03c3 #GREEK SMALL LETTER SIGMA +0xe6 0x00b5 #MICRO SIGN +0xe7 0x03c4 #GREEK SMALL LETTER TAU +0xe8 0x03a6 #GREEK CAPITAL LETTER PHI +0xe9 0x0398 #GREEK CAPITAL LETTER THETA +0xea 0x03a9 #GREEK CAPITAL LETTER OMEGA +0xeb 0x03b4 #GREEK SMALL LETTER DELTA +0xec 0x221e #INFINITY +0xed 0x03c6 #GREEK SMALL LETTER PHI +0xee 0x03b5 #GREEK SMALL LETTER EPSILON +0xef 0x2229 #INTERSECTION +0xf0 0x2261 #IDENTICAL TO +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x2265 #GREATER-THAN OR EQUAL TO +0xf3 0x2264 #LESS-THAN OR EQUAL TO +0xf4 0x2320 #TOP HALF INTEGRAL +0xf5 0x2321 #BOTTOM HALF INTEGRAL +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x2248 #ALMOST 
EQUAL TO +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x2219 #BULLET OPERATOR +0xfa 0x00b7 #MIDDLE DOT +0xfb 0x221a #SQUARE ROOT +0xfc 0x207f #SUPERSCRIPT LATIN SMALL LETTER N +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git a/charsets/cp864.txt b/charsets/cp864.txt new file mode 100644 index 0000000..4cf3394 --- /dev/null +++ b/charsets/cp864.txt @@ -0,0 +1,275 @@ +# +# Name: cp864_DOSArabic to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp864_DOSArabic code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp864_DOSArabic order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x066a #ARABIC PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 
#APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL 
LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00b0 #DEGREE SIGN +0x81 0x00b7 #MIDDLE DOT +0x82 0x2219 #BULLET OPERATOR +0x83 0x221a #SQUARE ROOT +0x84 0x2592 #MEDIUM SHADE +0x85 0x2500 #FORMS LIGHT HORIZONTAL +0x86 0x2502 #FORMS LIGHT VERTICAL +0x87 0x253c #FORMS LIGHT VERTICAL AND HORIZONTAL +0x88 0x2524 #FORMS LIGHT VERTICAL AND LEFT +0x89 0x252c #FORMS LIGHT DOWN AND HORIZONTAL +0x8a 0x251c #FORMS LIGHT VERTICAL AND RIGHT +0x8b 0x2534 #FORMS LIGHT UP AND HORIZONTAL +0x8c 0x2510 #FORMS LIGHT DOWN AND LEFT +0x8d 0x250c #FORMS LIGHT DOWN AND RIGHT +0x8e 0x2514 #FORMS LIGHT UP AND RIGHT +0x8f 0x2518 #FORMS LIGHT UP AND LEFT +0x90 0x03b2 #GREEK SMALL BETA +0x91 0x221e #INFINITY +0x92 0x03c6 #GREEK SMALL PHI +0x93 0x00b1 #PLUS-OR-MINUS SIGN +0x94 0x00bd #FRACTION 1/2 +0x95 0x00bc #FRACTION 1/4 +0x96 0x2248 #ALMOST EQUAL TO +0x97 0x00ab #LEFT POINTING GUILLEMET +0x98 0x00bb #RIGHT POINTING GUILLEMET +0x99 0xfef7 #ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE ISOLATED FORM +0x9a 0xfef8 #ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE FINAL FORM +0x9b #UNDEFINED +0x9c #UNDEFINED +0x9d 0xfefb #ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM +0x9e 0xfefc #ARABIC LIGATURE LAM WITH ALEF 
FINAL FORM +0x9f #UNDEFINED +0xa0 0x00a0 #NON-BREAKING SPACE +0xa1 0x00ad #SOFT HYPHEN +0xa2 0xfe82 #ARABIC LETTER ALEF WITH MADDA ABOVE FINAL FORM +0xa3 0x00a3 #POUND SIGN +0xa4 0x00a4 #CURRENCY SIGN +0xa5 0xfe84 #ARABIC LETTER ALEF WITH HAMZA ABOVE FINAL FORM +0xa6 #UNDEFINED +0xa7 #UNDEFINED +0xa8 0xfe8e #ARABIC LETTER ALEF FINAL FORM +0xa9 0xfe8f #ARABIC LETTER BEH ISOLATED FORM +0xaa 0xfe95 #ARABIC LETTER TEH ISOLATED FORM +0xab 0xfe99 #ARABIC LETTER THEH ISOLATED FORM +0xac 0x060c #ARABIC COMMA +0xad 0xfe9d #ARABIC LETTER JEEM ISOLATED FORM +0xae 0xfea1 #ARABIC LETTER HAH ISOLATED FORM +0xaf 0xfea5 #ARABIC LETTER KHAH ISOLATED FORM +0xb0 0x0660 #ARABIC-INDIC DIGIT ZERO +0xb1 0x0661 #ARABIC-INDIC DIGIT ONE +0xb2 0x0662 #ARABIC-INDIC DIGIT TWO +0xb3 0x0663 #ARABIC-INDIC DIGIT THREE +0xb4 0x0664 #ARABIC-INDIC DIGIT FOUR +0xb5 0x0665 #ARABIC-INDIC DIGIT FIVE +0xb6 0x0666 #ARABIC-INDIC DIGIT SIX +0xb7 0x0667 #ARABIC-INDIC DIGIT SEVEN +0xb8 0x0668 #ARABIC-INDIC DIGIT EIGHT +0xb9 0x0669 #ARABIC-INDIC DIGIT NINE +0xba 0xfed1 #ARABIC LETTER FEH ISOLATED FORM +0xbb 0x061b #ARABIC SEMICOLON +0xbc 0xfeb1 #ARABIC LETTER SEEN ISOLATED FORM +0xbd 0xfeb5 #ARABIC LETTER SHEEN ISOLATED FORM +0xbe 0xfeb9 #ARABIC LETTER SAD ISOLATED FORM +0xbf 0x061f #ARABIC QUESTION MARK +0xc0 0x00a2 #CENT SIGN +0xc1 0xfe80 #ARABIC LETTER HAMZA ISOLATED FORM +0xc2 0xfe81 #ARABIC LETTER ALEF WITH MADDA ABOVE ISOLATED FORM +0xc3 0xfe83 #ARABIC LETTER ALEF WITH HAMZA ABOVE ISOLATED FORM +0xc4 0xfe85 #ARABIC LETTER WAW WITH HAMZA ABOVE ISOLATED FORM +0xc5 0xfeca #ARABIC LETTER AIN FINAL FORM +0xc6 0xfe8b #ARABIC LETTER YEH WITH HAMZA ABOVE INITIAL FORM +0xc7 0xfe8d #ARABIC LETTER ALEF ISOLATED FORM +0xc8 0xfe91 #ARABIC LETTER BEH INITIAL FORM +0xc9 0xfe93 #ARABIC LETTER TEH MARBUTA ISOLATED FORM +0xca 0xfe97 #ARABIC LETTER TEH INITIAL FORM +0xcb 0xfe9b #ARABIC LETTER THEH INITIAL FORM +0xcc 0xfe9f #ARABIC LETTER JEEM INITIAL FORM +0xcd 0xfea3 #ARABIC LETTER HAH INITIAL FORM +0xce 0xfea7 #ARABIC 
LETTER KHAH INITIAL FORM +0xcf 0xfea9 #ARABIC LETTER DAL ISOLATED FORM +0xd0 0xfeab #ARABIC LETTER THAL ISOLATED FORM +0xd1 0xfead #ARABIC LETTER REH ISOLATED FORM +0xd2 0xfeaf #ARABIC LETTER ZAIN ISOLATED FORM +0xd3 0xfeb3 #ARABIC LETTER SEEN INITIAL FORM +0xd4 0xfeb7 #ARABIC LETTER SHEEN INITIAL FORM +0xd5 0xfebb #ARABIC LETTER SAD INITIAL FORM +0xd6 0xfebf #ARABIC LETTER DAD INITIAL FORM +0xd7 0xfec1 #ARABIC LETTER TAH ISOLATED FORM +0xd8 0xfec5 #ARABIC LETTER ZAH ISOLATED FORM +0xd9 0xfecb #ARABIC LETTER AIN INITIAL FORM +0xda 0xfecf #ARABIC LETTER GHAIN INITIAL FORM +0xdb 0x00a6 #BROKEN VERTICAL BAR +0xdc 0x00ac #NOT SIGN +0xdd 0x00f7 #DIVISION SIGN +0xde 0x00d7 #MULTIPLICATION SIGN +0xdf 0xfec9 #ARABIC LETTER AIN ISOLATED FORM +0xe0 0x0640 #ARABIC TATWEEL +0xe1 0xfed3 #ARABIC LETTER FEH INITIAL FORM +0xe2 0xfed7 #ARABIC LETTER QAF INITIAL FORM +0xe3 0xfedb #ARABIC LETTER KAF INITIAL FORM +0xe4 0xfedf #ARABIC LETTER LAM INITIAL FORM +0xe5 0xfee3 #ARABIC LETTER MEEM INITIAL FORM +0xe6 0xfee7 #ARABIC LETTER NOON INITIAL FORM +0xe7 0xfeeb #ARABIC LETTER HEH INITIAL FORM +0xe8 0xfeed #ARABIC LETTER WAW ISOLATED FORM +0xe9 0xfeef #ARABIC LETTER ALEF MAKSURA ISOLATED FORM +0xea 0xfef3 #ARABIC LETTER YEH INITIAL FORM +0xeb 0xfebd #ARABIC LETTER DAD ISOLATED FORM +0xec 0xfecc #ARABIC LETTER AIN MEDIAL FORM +0xed 0xfece #ARABIC LETTER GHAIN FINAL FORM +0xee 0xfecd #ARABIC LETTER GHAIN ISOLATED FORM +0xef 0xfee1 #ARABIC LETTER MEEM ISOLATED FORM +0xf0 0xfe7d #ARABIC SHADDA MEDIAL FORM +0xf1 0x0651 #ARABIC SHADDAH +0xf2 0xfee5 #ARABIC LETTER NOON ISOLATED FORM +0xf3 0xfee9 #ARABIC LETTER HEH ISOLATED FORM +0xf4 0xfeec #ARABIC LETTER HEH MEDIAL FORM +0xf5 0xfef0 #ARABIC LETTER ALEF MAKSURA FINAL FORM +0xf6 0xfef2 #ARABIC LETTER YEH FINAL FORM +0xf7 0xfed0 #ARABIC LETTER GHAIN MEDIAL FORM +0xf8 0xfed5 #ARABIC LETTER QAF ISOLATED FORM +0xf9 0xfef5 #ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE ISOLATED FORM +0xfa 0xfef6 #ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE FINAL 
FORM +0xfb 0xfedd #ARABIC LETTER LAM ISOLATED FORM +0xfc 0xfed9 #ARABIC LETTER KAF ISOLATED FORM +0xfd 0xfef1 #ARABIC LETTER YEH ISOLATED FORM +0xfe 0x25a0 #BLACK SQUARE +0xff #UNDEFINED + + \ No newline at end of file diff --git a/charsets/cp865.txt b/charsets/cp865.txt new file mode 100644 index 0000000..38f5ce0 --- /dev/null +++ b/charsets/cp865.txt @@ -0,0 +1,275 @@ +# +# Name: cp865_DOSNordic to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp865_DOSNordic code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp865_DOSNordic order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT 
PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN 
SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x00c7 #LATIN CAPITAL LETTER C WITH CEDILLA +0x81 0x00fc #LATIN SMALL LETTER U WITH DIAERESIS +0x82 0x00e9 #LATIN SMALL LETTER E WITH ACUTE +0x83 0x00e2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0x84 0x00e4 #LATIN SMALL LETTER A WITH DIAERESIS +0x85 0x00e0 #LATIN SMALL LETTER A WITH GRAVE +0x86 0x00e5 #LATIN SMALL LETTER A WITH RING ABOVE +0x87 0x00e7 #LATIN SMALL LETTER C WITH CEDILLA +0x88 0x00ea #LATIN SMALL LETTER E WITH CIRCUMFLEX +0x89 0x00eb #LATIN SMALL LETTER E WITH DIAERESIS +0x8a 0x00e8 #LATIN SMALL LETTER E WITH GRAVE +0x8b 0x00ef #LATIN SMALL LETTER I WITH DIAERESIS +0x8c 0x00ee #LATIN SMALL LETTER I WITH CIRCUMFLEX +0x8d 0x00ec #LATIN SMALL LETTER I WITH GRAVE +0x8e 0x00c4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0x8f 0x00c5 #LATIN CAPITAL LETTER A WITH RING ABOVE +0x90 0x00c9 #LATIN CAPITAL LETTER E WITH ACUTE +0x91 0x00e6 #LATIN SMALL LIGATURE AE +0x92 0x00c6 #LATIN CAPITAL LIGATURE AE +0x93 0x00f4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0x94 0x00f6 #LATIN SMALL LETTER O WITH DIAERESIS +0x95 0x00f2 #LATIN SMALL LETTER O WITH GRAVE +0x96 0x00fb #LATIN SMALL LETTER U WITH CIRCUMFLEX +0x97 0x00f9 #LATIN SMALL LETTER U WITH GRAVE +0x98 0x00ff #LATIN SMALL LETTER Y WITH DIAERESIS +0x99 
0x00d6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0x9a 0x00dc #LATIN CAPITAL LETTER U WITH DIAERESIS +0x9b 0x00f8 #LATIN SMALL LETTER O WITH STROKE +0x9c 0x00a3 #POUND SIGN +0x9d 0x00d8 #LATIN CAPITAL LETTER O WITH STROKE +0x9e 0x20a7 #PESETA SIGN +0x9f 0x0192 #LATIN SMALL LETTER F WITH HOOK +0xa0 0x00e1 #LATIN SMALL LETTER A WITH ACUTE +0xa1 0x00ed #LATIN SMALL LETTER I WITH ACUTE +0xa2 0x00f3 #LATIN SMALL LETTER O WITH ACUTE +0xa3 0x00fa #LATIN SMALL LETTER U WITH ACUTE +0xa4 0x00f1 #LATIN SMALL LETTER N WITH TILDE +0xa5 0x00d1 #LATIN CAPITAL LETTER N WITH TILDE +0xa6 0x00aa #FEMININE ORDINAL INDICATOR +0xa7 0x00ba #MASCULINE ORDINAL INDICATOR +0xa8 0x00bf #INVERTED QUESTION MARK +0xa9 0x2310 #REVERSED NOT SIGN +0xaa 0x00ac #NOT SIGN +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x00bc #VULGAR FRACTION ONE QUARTER +0xad 0x00a1 #INVERTED EXCLAMATION MARK +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00a4 #CURRENCY SIGN +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb6 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb7 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xb8 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xbe 0x255b #BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 
0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xc7 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xd0 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xd1 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xd2 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xd3 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xd4 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xd5 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xd6 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xd7 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xd8 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x258c #LEFT HALF BLOCK +0xde 0x2590 #RIGHT HALF BLOCK +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x03b1 #GREEK SMALL LETTER ALPHA +0xe1 0x00df #LATIN SMALL LETTER SHARP S +0xe2 0x0393 #GREEK CAPITAL LETTER GAMMA +0xe3 0x03c0 #GREEK SMALL LETTER PI +0xe4 0x03a3 #GREEK CAPITAL LETTER SIGMA +0xe5 0x03c3 #GREEK SMALL LETTER SIGMA +0xe6 0x00b5 #MICRO SIGN +0xe7 0x03c4 #GREEK SMALL LETTER TAU +0xe8 0x03a6 #GREEK CAPITAL LETTER PHI +0xe9 0x0398 #GREEK CAPITAL LETTER THETA +0xea 0x03a9 #GREEK CAPITAL LETTER OMEGA +0xeb 0x03b4 #GREEK SMALL LETTER DELTA +0xec 0x221e #INFINITY +0xed 0x03c6 #GREEK SMALL LETTER PHI +0xee 0x03b5 #GREEK SMALL LETTER EPSILON +0xef 0x2229 #INTERSECTION +0xf0 0x2261 #IDENTICAL TO +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x2265 #GREATER-THAN OR EQUAL TO +0xf3 0x2264 
#LESS-THAN OR EQUAL TO +0xf4 0x2320 #TOP HALF INTEGRAL +0xf5 0x2321 #BOTTOM HALF INTEGRAL +0xf6 0x00f7 #DIVISION SIGN +0xf7 0x2248 #ALMOST EQUAL TO +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x2219 #BULLET OPERATOR +0xfa 0x00b7 #MIDDLE DOT +0xfb 0x221a #SQUARE ROOT +0xfc 0x207f #SUPERSCRIPT LATIN SMALL LETTER N +0xfd 0x00b2 #SUPERSCRIPT TWO +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git a/charsets/cp866.txt b/charsets/cp866.txt new file mode 100644 index 0000000..25b831a --- /dev/null +++ b/charsets/cp866.txt @@ -0,0 +1,275 @@ +# +# Name: cp866_DOSCyrillicRussian to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp866_DOSCyrillicRussian code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp866_DOSCyrillicRussian order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 
#EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 
0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 0x0410 #CYRILLIC CAPITAL LETTER A +0x81 0x0411 #CYRILLIC CAPITAL LETTER BE +0x82 0x0412 #CYRILLIC CAPITAL LETTER VE +0x83 0x0413 #CYRILLIC CAPITAL LETTER GHE +0x84 0x0414 #CYRILLIC CAPITAL LETTER DE +0x85 0x0415 #CYRILLIC CAPITAL LETTER IE +0x86 0x0416 #CYRILLIC CAPITAL LETTER ZHE +0x87 0x0417 #CYRILLIC CAPITAL LETTER ZE +0x88 0x0418 #CYRILLIC CAPITAL LETTER I +0x89 0x0419 #CYRILLIC CAPITAL LETTER SHORT I +0x8a 0x041a #CYRILLIC CAPITAL LETTER KA +0x8b 0x041b #CYRILLIC CAPITAL LETTER EL +0x8c 0x041c #CYRILLIC CAPITAL LETTER EM +0x8d 0x041d #CYRILLIC CAPITAL LETTER EN +0x8e 0x041e #CYRILLIC CAPITAL LETTER O +0x8f 0x041f #CYRILLIC CAPITAL LETTER PE +0x90 0x0420 #CYRILLIC CAPITAL LETTER ER +0x91 0x0421 #CYRILLIC CAPITAL LETTER ES +0x92 0x0422 #CYRILLIC CAPITAL LETTER TE +0x93 0x0423 #CYRILLIC CAPITAL LETTER U +0x94 0x0424 #CYRILLIC CAPITAL LETTER EF +0x95 0x0425 #CYRILLIC CAPITAL LETTER HA +0x96 0x0426 #CYRILLIC CAPITAL LETTER TSE +0x97 0x0427 #CYRILLIC CAPITAL LETTER CHE +0x98 
0x0428 #CYRILLIC CAPITAL LETTER SHA +0x99 0x0429 #CYRILLIC CAPITAL LETTER SHCHA +0x9a 0x042a #CYRILLIC CAPITAL LETTER HARD SIGN +0x9b 0x042b #CYRILLIC CAPITAL LETTER YERU +0x9c 0x042c #CYRILLIC CAPITAL LETTER SOFT SIGN +0x9d 0x042d #CYRILLIC CAPITAL LETTER E +0x9e 0x042e #CYRILLIC CAPITAL LETTER YU +0x9f 0x042f #CYRILLIC CAPITAL LETTER YA +0xa0 0x0430 #CYRILLIC SMALL LETTER A +0xa1 0x0431 #CYRILLIC SMALL LETTER BE +0xa2 0x0432 #CYRILLIC SMALL LETTER VE +0xa3 0x0433 #CYRILLIC SMALL LETTER GHE +0xa4 0x0434 #CYRILLIC SMALL LETTER DE +0xa5 0x0435 #CYRILLIC SMALL LETTER IE +0xa6 0x0436 #CYRILLIC SMALL LETTER ZHE +0xa7 0x0437 #CYRILLIC SMALL LETTER ZE +0xa8 0x0438 #CYRILLIC SMALL LETTER I +0xa9 0x0439 #CYRILLIC SMALL LETTER SHORT I +0xaa 0x043a #CYRILLIC SMALL LETTER KA +0xab 0x043b #CYRILLIC SMALL LETTER EL +0xac 0x043c #CYRILLIC SMALL LETTER EM +0xad 0x043d #CYRILLIC SMALL LETTER EN +0xae 0x043e #CYRILLIC SMALL LETTER O +0xaf 0x043f #CYRILLIC SMALL LETTER PE +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x2561 #BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb6 0x2562 #BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb7 0x2556 #BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xb8 0x2555 #BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x255c #BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xbe 0x255b #BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL 
AND HORIZONTAL +0xc6 0x255e #BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xc7 0x255f #BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x2567 #BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xd0 0x2568 #BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xd1 0x2564 #BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xd2 0x2565 #BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xd3 0x2559 #BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xd4 0x2558 #BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xd5 0x2552 #BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xd6 0x2553 #BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xd7 0x256b #BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xd8 0x256a #BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x258c #LEFT HALF BLOCK +0xde 0x2590 #RIGHT HALF BLOCK +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x0440 #CYRILLIC SMALL LETTER ER +0xe1 0x0441 #CYRILLIC SMALL LETTER ES +0xe2 0x0442 #CYRILLIC SMALL LETTER TE +0xe3 0x0443 #CYRILLIC SMALL LETTER U +0xe4 0x0444 #CYRILLIC SMALL LETTER EF +0xe5 0x0445 #CYRILLIC SMALL LETTER HA +0xe6 0x0446 #CYRILLIC SMALL LETTER TSE +0xe7 0x0447 #CYRILLIC SMALL LETTER CHE +0xe8 0x0448 #CYRILLIC SMALL LETTER SHA +0xe9 0x0449 #CYRILLIC SMALL LETTER SHCHA +0xea 0x044a #CYRILLIC SMALL LETTER HARD SIGN +0xeb 0x044b #CYRILLIC SMALL LETTER YERU +0xec 0x044c #CYRILLIC SMALL LETTER SOFT SIGN +0xed 0x044d #CYRILLIC SMALL LETTER E +0xee 0x044e #CYRILLIC SMALL LETTER YU +0xef 0x044f #CYRILLIC SMALL LETTER YA +0xf0 0x0401 #CYRILLIC CAPITAL 
LETTER IO +0xf1 0x0451 #CYRILLIC SMALL LETTER IO +0xf2 0x0404 #CYRILLIC CAPITAL LETTER UKRAINIAN IE +0xf3 0x0454 #CYRILLIC SMALL LETTER UKRAINIAN IE +0xf4 0x0407 #CYRILLIC CAPITAL LETTER YI +0xf5 0x0457 #CYRILLIC SMALL LETTER YI +0xf6 0x040e #CYRILLIC CAPITAL LETTER SHORT U +0xf7 0x045e #CYRILLIC SMALL LETTER SHORT U +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x2219 #BULLET OPERATOR +0xfa 0x00b7 #MIDDLE DOT +0xfb 0x221a #SQUARE ROOT +0xfc 0x2116 #NUMERO SIGN +0xfd 0x00a4 #CURRENCY SIGN +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git a/charsets/cp869.txt b/charsets/cp869.txt new file mode 100644 index 0000000..691ef97 --- /dev/null +++ b/charsets/cp869.txt @@ -0,0 +1,275 @@ +# +# Name: cp869_DOSGreek2 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/24/96 +# Authors: Lori Brownell +# K.D. Chang +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp869_DOSGreek2 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp869_DOSGreek2 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0a 0x000a #LINE FEED +0x0b 0x000b #VERTICAL TABULATION +0x0c 0x000c #FORM FEED +0x0d 0x000d #CARRIAGE RETURN +0x0e 0x000e #SHIFT OUT +0x0f 0x000f #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1a 0x001a #SUBSTITUTE +0x1b 0x001b #ESCAPE +0x1c 0x001c #FILE SEPARATOR +0x1d 
0x001d #GROUP SEPARATOR +0x1e 0x001e #RECORD SEPARATOR +0x1f 0x001f #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2a 0x002a #ASTERISK +0x2b 0x002b #PLUS SIGN +0x2c 0x002c #COMMA +0x2d 0x002d #HYPHEN-MINUS +0x2e 0x002e #FULL STOP +0x2f 0x002f #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3a 0x003a #COLON +0x3b 0x003b #SEMICOLON +0x3c 0x003c #LESS-THAN SIGN +0x3d 0x003d #EQUALS SIGN +0x3e 0x003e #GREATER-THAN SIGN +0x3f 0x003f #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4a 0x004a #LATIN CAPITAL LETTER J +0x4b 0x004b #LATIN CAPITAL LETTER K +0x4c 0x004c #LATIN CAPITAL LETTER L +0x4d 0x004d #LATIN CAPITAL LETTER M +0x4e 0x004e #LATIN CAPITAL LETTER N +0x4f 0x004f #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5a 0x005a #LATIN CAPITAL LETTER Z +0x5b 0x005b #LEFT SQUARE BRACKET +0x5c 0x005c #REVERSE SOLIDUS +0x5d 0x005d #RIGHT SQUARE BRACKET +0x5e 
0x005e #CIRCUMFLEX ACCENT +0x5f 0x005f #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6a 0x006a #LATIN SMALL LETTER J +0x6b 0x006b #LATIN SMALL LETTER K +0x6c 0x006c #LATIN SMALL LETTER L +0x6d 0x006d #LATIN SMALL LETTER M +0x6e 0x006e #LATIN SMALL LETTER N +0x6f 0x006f #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7a 0x007a #LATIN SMALL LETTER Z +0x7b 0x007b #LEFT CURLY BRACKET +0x7c 0x007c #VERTICAL LINE +0x7d 0x007d #RIGHT CURLY BRACKET +0x7e 0x007e #TILDE +0x7f 0x007f #DELETE +0x80 #UNDEFINED +0x81 #UNDEFINED +0x82 #UNDEFINED +0x83 #UNDEFINED +0x84 #UNDEFINED +0x85 #UNDEFINED +0x86 0x0386 #GREEK CAPITAL LETTER ALPHA WITH TONOS +0x87 #UNDEFINED +0x88 0x00b7 #MIDDLE DOT +0x89 0x00ac #NOT SIGN +0x8a 0x00a6 #BROKEN BAR +0x8b 0x2018 #LEFT SINGLE QUOTATION MARK +0x8c 0x2019 #RIGHT SINGLE QUOTATION MARK +0x8d 0x0388 #GREEK CAPITAL LETTER EPSILON WITH TONOS +0x8e 0x2015 #HORIZONTAL BAR +0x8f 0x0389 #GREEK CAPITAL LETTER ETA WITH TONOS +0x90 0x038a #GREEK CAPITAL LETTER IOTA WITH TONOS +0x91 0x03aa #GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +0x92 0x038c #GREEK CAPITAL LETTER OMICRON WITH TONOS +0x93 #UNDEFINED +0x94 #UNDEFINED +0x95 0x038e #GREEK CAPITAL LETTER UPSILON WITH TONOS +0x96 0x03ab #GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +0x97 0x00a9 #COPYRIGHT SIGN +0x98 0x038f #GREEK CAPITAL LETTER OMEGA WITH TONOS +0x99 0x00b2 #SUPERSCRIPT TWO 
+0x9a 0x00b3 #SUPERSCRIPT THREE +0x9b 0x03ac #GREEK SMALL LETTER ALPHA WITH TONOS +0x9c 0x00a3 #POUND SIGN +0x9d 0x03ad #GREEK SMALL LETTER EPSILON WITH TONOS +0x9e 0x03ae #GREEK SMALL LETTER ETA WITH TONOS +0x9f 0x03af #GREEK SMALL LETTER IOTA WITH TONOS +0xa0 0x03ca #GREEK SMALL LETTER IOTA WITH DIALYTIKA +0xa1 0x0390 #GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0xa2 0x03cc #GREEK SMALL LETTER OMICRON WITH TONOS +0xa3 0x03cd #GREEK SMALL LETTER UPSILON WITH TONOS +0xa4 0x0391 #GREEK CAPITAL LETTER ALPHA +0xa5 0x0392 #GREEK CAPITAL LETTER BETA +0xa6 0x0393 #GREEK CAPITAL LETTER GAMMA +0xa7 0x0394 #GREEK CAPITAL LETTER DELTA +0xa8 0x0395 #GREEK CAPITAL LETTER EPSILON +0xa9 0x0396 #GREEK CAPITAL LETTER ZETA +0xaa 0x0397 #GREEK CAPITAL LETTER ETA +0xab 0x00bd #VULGAR FRACTION ONE HALF +0xac 0x0398 #GREEK CAPITAL LETTER THETA +0xad 0x0399 #GREEK CAPITAL LETTER IOTA +0xae 0x00ab #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xaf 0x00bb #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xb0 0x2591 #LIGHT SHADE +0xb1 0x2592 #MEDIUM SHADE +0xb2 0x2593 #DARK SHADE +0xb3 0x2502 #BOX DRAWINGS LIGHT VERTICAL +0xb4 0x2524 #BOX DRAWINGS LIGHT VERTICAL AND LEFT +0xb5 0x039a #GREEK CAPITAL LETTER KAPPA +0xb6 0x039b #GREEK CAPITAL LETTER LAMDA +0xb7 0x039c #GREEK CAPITAL LETTER MU +0xb8 0x039d #GREEK CAPITAL LETTER NU +0xb9 0x2563 #BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xba 0x2551 #BOX DRAWINGS DOUBLE VERTICAL +0xbb 0x2557 #BOX DRAWINGS DOUBLE DOWN AND LEFT +0xbc 0x255d #BOX DRAWINGS DOUBLE UP AND LEFT +0xbd 0x039e #GREEK CAPITAL LETTER XI +0xbe 0x039f #GREEK CAPITAL LETTER OMICRON +0xbf 0x2510 #BOX DRAWINGS LIGHT DOWN AND LEFT +0xc0 0x2514 #BOX DRAWINGS LIGHT UP AND RIGHT +0xc1 0x2534 #BOX DRAWINGS LIGHT UP AND HORIZONTAL +0xc2 0x252c #BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0xc3 0x251c #BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0xc4 0x2500 #BOX DRAWINGS LIGHT HORIZONTAL +0xc5 0x253c #BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0xc6 0x03a0 #GREEK CAPITAL LETTER PI +0xc7 0x03a1 
#GREEK CAPITAL LETTER RHO +0xc8 0x255a #BOX DRAWINGS DOUBLE UP AND RIGHT +0xc9 0x2554 #BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xca 0x2569 #BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xcb 0x2566 #BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xcc 0x2560 #BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xcd 0x2550 #BOX DRAWINGS DOUBLE HORIZONTAL +0xce 0x256c #BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xcf 0x03a3 #GREEK CAPITAL LETTER SIGMA +0xd0 0x03a4 #GREEK CAPITAL LETTER TAU +0xd1 0x03a5 #GREEK CAPITAL LETTER UPSILON +0xd2 0x03a6 #GREEK CAPITAL LETTER PHI +0xd3 0x03a7 #GREEK CAPITAL LETTER CHI +0xd4 0x03a8 #GREEK CAPITAL LETTER PSI +0xd5 0x03a9 #GREEK CAPITAL LETTER OMEGA +0xd6 0x03b1 #GREEK SMALL LETTER ALPHA +0xd7 0x03b2 #GREEK SMALL LETTER BETA +0xd8 0x03b3 #GREEK SMALL LETTER GAMMA +0xd9 0x2518 #BOX DRAWINGS LIGHT UP AND LEFT +0xda 0x250c #BOX DRAWINGS LIGHT DOWN AND RIGHT +0xdb 0x2588 #FULL BLOCK +0xdc 0x2584 #LOWER HALF BLOCK +0xdd 0x03b4 #GREEK SMALL LETTER DELTA +0xde 0x03b5 #GREEK SMALL LETTER EPSILON +0xdf 0x2580 #UPPER HALF BLOCK +0xe0 0x03b6 #GREEK SMALL LETTER ZETA +0xe1 0x03b7 #GREEK SMALL LETTER ETA +0xe2 0x03b8 #GREEK SMALL LETTER THETA +0xe3 0x03b9 #GREEK SMALL LETTER IOTA +0xe4 0x03ba #GREEK SMALL LETTER KAPPA +0xe5 0x03bb #GREEK SMALL LETTER LAMDA +0xe6 0x03bc #GREEK SMALL LETTER MU +0xe7 0x03bd #GREEK SMALL LETTER NU +0xe8 0x03be #GREEK SMALL LETTER XI +0xe9 0x03bf #GREEK SMALL LETTER OMICRON +0xea 0x03c0 #GREEK SMALL LETTER PI +0xeb 0x03c1 #GREEK SMALL LETTER RHO +0xec 0x03c3 #GREEK SMALL LETTER SIGMA +0xed 0x03c2 #GREEK SMALL LETTER FINAL SIGMA +0xee 0x03c4 #GREEK SMALL LETTER TAU +0xef 0x0384 #GREEK TONOS +0xf0 0x00ad #SOFT HYPHEN +0xf1 0x00b1 #PLUS-MINUS SIGN +0xf2 0x03c5 #GREEK SMALL LETTER UPSILON +0xf3 0x03c6 #GREEK SMALL LETTER PHI +0xf4 0x03c7 #GREEK SMALL LETTER CHI +0xf5 0x00a7 #SECTION SIGN +0xf6 0x03c8 #GREEK SMALL LETTER PSI +0xf7 0x0385 #GREEK DIALYTIKA TONOS +0xf8 0x00b0 #DEGREE SIGN +0xf9 0x00a8 #DIAERESIS +0xfa 0x03c9 #GREEK SMALL LETTER OMEGA 
+0xfb 0x03cb #GREEK SMALL LETTER UPSILON WITH DIALYTIKA +0xfc 0x03b0 #GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +0xfd 0x03ce #GREEK SMALL LETTER OMEGA WITH TONOS +0xfe 0x25a0 #BLACK SQUARE +0xff 0x00a0 #NO-BREAK SPACE + + \ No newline at end of file diff --git a/charsets/cp874.txt b/charsets/cp874.txt new file mode 100644 index 0000000..18eb2bc --- /dev/null +++ b/charsets/cp874.txt @@ -0,0 +1,274 @@ +# +# Name: cp874 to Unicode table +# Unicode version: 2.0 +# Table version: 2.00 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: cpxlate@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp874 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp874 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 
#RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 
#LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 #UNDEFINED +0x83 #UNDEFINED +0x84 #UNDEFINED +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 #UNDEFINED +0x87 #UNDEFINED +0x88 #UNDEFINED +0x89 #UNDEFINED +0x8A #UNDEFINED +0x8B #UNDEFINED +0x8C #UNDEFINED +0x8D #UNDEFINED +0x8E #UNDEFINED +0x8F #UNDEFINED +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 #UNDEFINED +0x9A #UNDEFINED +0x9B #UNDEFINED +0x9C #UNDEFINED +0x9D #UNDEFINED +0x9E #UNDEFINED +0x9F #UNDEFINED +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x0E01 #THAI CHARACTER KO KAI +0xA2 0x0E02 #THAI CHARACTER KHO KHAI +0xA3 0x0E03 #THAI CHARACTER KHO KHUAT +0xA4 0x0E04 #THAI CHARACTER KHO KHWAI +0xA5 0x0E05 #THAI CHARACTER KHO KHON +0xA6 0x0E06 #THAI CHARACTER KHO RAKHANG +0xA7 0x0E07 #THAI CHARACTER NGO NGU +0xA8 0x0E08 #THAI CHARACTER CHO CHAN +0xA9 0x0E09 #THAI CHARACTER CHO CHING +0xAA 0x0E0A #THAI CHARACTER CHO CHANG +0xAB 0x0E0B #THAI CHARACTER SO SO +0xAC 0x0E0C #THAI CHARACTER CHO CHOE +0xAD 0x0E0D #THAI CHARACTER YO YING +0xAE 
0x0E0E #THAI CHARACTER DO CHADA +0xAF 0x0E0F #THAI CHARACTER TO PATAK +0xB0 0x0E10 #THAI CHARACTER THO THAN +0xB1 0x0E11 #THAI CHARACTER THO NANGMONTHO +0xB2 0x0E12 #THAI CHARACTER THO PHUTHAO +0xB3 0x0E13 #THAI CHARACTER NO NEN +0xB4 0x0E14 #THAI CHARACTER DO DEK +0xB5 0x0E15 #THAI CHARACTER TO TAO +0xB6 0x0E16 #THAI CHARACTER THO THUNG +0xB7 0x0E17 #THAI CHARACTER THO THAHAN +0xB8 0x0E18 #THAI CHARACTER THO THONG +0xB9 0x0E19 #THAI CHARACTER NO NU +0xBA 0x0E1A #THAI CHARACTER BO BAIMAI +0xBB 0x0E1B #THAI CHARACTER PO PLA +0xBC 0x0E1C #THAI CHARACTER PHO PHUNG +0xBD 0x0E1D #THAI CHARACTER FO FA +0xBE 0x0E1E #THAI CHARACTER PHO PHAN +0xBF 0x0E1F #THAI CHARACTER FO FAN +0xC0 0x0E20 #THAI CHARACTER PHO SAMPHAO +0xC1 0x0E21 #THAI CHARACTER MO MA +0xC2 0x0E22 #THAI CHARACTER YO YAK +0xC3 0x0E23 #THAI CHARACTER RO RUA +0xC4 0x0E24 #THAI CHARACTER RU +0xC5 0x0E25 #THAI CHARACTER LO LING +0xC6 0x0E26 #THAI CHARACTER LU +0xC7 0x0E27 #THAI CHARACTER WO WAEN +0xC8 0x0E28 #THAI CHARACTER SO SALA +0xC9 0x0E29 #THAI CHARACTER SO RUSI +0xCA 0x0E2A #THAI CHARACTER SO SUA +0xCB 0x0E2B #THAI CHARACTER HO HIP +0xCC 0x0E2C #THAI CHARACTER LO CHULA +0xCD 0x0E2D #THAI CHARACTER O ANG +0xCE 0x0E2E #THAI CHARACTER HO NOKHUK +0xCF 0x0E2F #THAI CHARACTER PAIYANNOI +0xD0 0x0E30 #THAI CHARACTER SARA A +0xD1 0x0E31 #THAI CHARACTER MAI HAN-AKAT +0xD2 0x0E32 #THAI CHARACTER SARA AA +0xD3 0x0E33 #THAI CHARACTER SARA AM +0xD4 0x0E34 #THAI CHARACTER SARA I +0xD5 0x0E35 #THAI CHARACTER SARA II +0xD6 0x0E36 #THAI CHARACTER SARA UE +0xD7 0x0E37 #THAI CHARACTER SARA UEE +0xD8 0x0E38 #THAI CHARACTER SARA U +0xD9 0x0E39 #THAI CHARACTER SARA UU +0xDA 0x0E3A #THAI CHARACTER PHINTHU +0xDB #UNDEFINED +0xDC #UNDEFINED +0xDD #UNDEFINED +0xDE #UNDEFINED +0xDF 0x0E3F #THAI CURRENCY SYMBOL BAHT +0xE0 0x0E40 #THAI CHARACTER SARA E +0xE1 0x0E41 #THAI CHARACTER SARA AE +0xE2 0x0E42 #THAI CHARACTER SARA O +0xE3 0x0E43 #THAI CHARACTER SARA AI MAIMUAN +0xE4 0x0E44 #THAI CHARACTER SARA AI MAIMALAI +0xE5 0x0E45 #THAI 
CHARACTER LAKKHANGYAO +0xE6 0x0E46 #THAI CHARACTER MAIYAMOK +0xE7 0x0E47 #THAI CHARACTER MAITAIKHU +0xE8 0x0E48 #THAI CHARACTER MAI EK +0xE9 0x0E49 #THAI CHARACTER MAI THO +0xEA 0x0E4A #THAI CHARACTER MAI TRI +0xEB 0x0E4B #THAI CHARACTER MAI CHATTAWA +0xEC 0x0E4C #THAI CHARACTER THANTHAKHAT +0xED 0x0E4D #THAI CHARACTER NIKHAHIT +0xEE 0x0E4E #THAI CHARACTER YAMAKKAN +0xEF 0x0E4F #THAI CHARACTER FONGMAN +0xF0 0x0E50 #THAI DIGIT ZERO +0xF1 0x0E51 #THAI DIGIT ONE +0xF2 0x0E52 #THAI DIGIT TWO +0xF3 0x0E53 #THAI DIGIT THREE +0xF4 0x0E54 #THAI DIGIT FOUR +0xF5 0x0E55 #THAI DIGIT FIVE +0xF6 0x0E56 #THAI DIGIT SIX +0xF7 0x0E57 #THAI DIGIT SEVEN +0xF8 0x0E58 #THAI DIGIT EIGHT +0xF9 0x0E59 #THAI DIGIT NINE +0xFA 0x0E5A #THAI CHARACTER ANGKHANKHU +0xFB 0x0E5B #THAI CHARACTER KHOMUT +0xFC #UNDEFINED +0xFD #UNDEFINED +0xFE #UNDEFINED +0xFF #UNDEFINED diff --git a/charsets/koi8-r.txt b/charsets/koi8-r.txt new file mode 100644 index 0000000..94fffd8 --- /dev/null +++ b/charsets/koi8-r.txt @@ -0,0 +1,257 @@ +# Koi8 to unicode translation (from rfc1489) +0x00 0x0000 # NULL +0x01 0x0001 # START OF HEADING +0x02 0x0002 # START OF TEXT +0x03 0x0003 # END OF TEXT +0x04 0x0004 # END OF TRANSMISSION +0x05 0x0005 # ENQUIRY +0x06 0x0006 # ACKNOWLEDGE +0x07 0x0007 # BELL +0x08 0x0008 # BACKSPACE +0x09 0x0009 # HORIZONTAL TABULATION +0x0a 0x000a # LINE FEED +0x0b 0x000b # VERTICAL TABULATION +0x0c 0x000c # FORM FEED +0x0d 0x000d # CARRIAGE RETURN +0x0e 0x000e # SHIFT OUT +0x0f 0x000f # SHIFT IN +0x10 0x0010 # DATA LINK ESCAPE +0x11 0x0011 # DEVICE CONTROL ONE +0x12 0x0012 # DEVICE CONTROL TWO +0x13 0x0013 # DEVICE CONTROL THREE +0x14 0x0014 # DEVICE CONTROL FOUR +0x15 0x0015 # NEGATIVE ACKNOWLEDGE +0x16 0x0016 # SYNCHRONOUS IDLE +0x17 0x0017 # END OF TRANSMISSION BLOCK +0x18 0x0018 # CANCEL +0x19 0x0019 # END OF MEDIUM +0x1a 0x001a # SUBSTITUTE +0x1b 0x001b # ESCAPE +0x1c 0x001c # FILE SEPARATOR +0x1d 0x001d # GROUP SEPARATOR +0x1e 0x001e # RECORD SEPARATOR +0x1f 0x001f # UNIT SEPARATOR +0x20 
0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2a 0x002a # ASTERISK +0x2b 0x002b # PLUS SIGN +0x2c 0x002c # COMMA +0x2d 0x002d # HYPHEN-MINUS +0x2e 0x002e # FULL STOP +0x2f 0x002f # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3a 0x003a # COLON +0x3b 0x003b # SEMICOLON +0x3c 0x003c # LESS-THAN SIGN +0x3d 0x003d # EQUALS SIGN +0x3e 0x003e # GREATER-THAN SIGN +0x3f 0x003f # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4a 0x004a # LATIN CAPITAL LETTER J +0x4b 0x004b # LATIN CAPITAL LETTER K +0x4c 0x004c # LATIN CAPITAL LETTER L +0x4d 0x004d # LATIN CAPITAL LETTER M +0x4e 0x004e # LATIN CAPITAL LETTER N +0x4f 0x004f # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5a 0x005a # LATIN CAPITAL LETTER Z +0x5b 0x005b # LEFT SQUARE BRACKET +0x5c 0x005c # REVERSE SOLIDUS +0x5d 0x005d # RIGHT SQUARE BRACKET +0x5e 0x005e # CIRCUMFLEX ACCENT 
+0x5f 0x005f # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6a 0x006a # LATIN SMALL LETTER J +0x6b 0x006b # LATIN SMALL LETTER K +0x6c 0x006c # LATIN SMALL LETTER L +0x6d 0x006d # LATIN SMALL LETTER M +0x6e 0x006e # LATIN SMALL LETTER N +0x6f 0x006f # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7a 0x007a # LATIN SMALL LETTER Z +0x7b 0x007b # LEFT CURLY BRACKET +0x7c 0x007c # VERTICAL LINE +0x7d 0x007d # RIGHT CURLY BRACKET +0x7e 0x007e # TILDE +0x7f 0x007f # DELETE +0x80 0x2500 # BOX DRAWINGS LIGHT HORIZONTAL +0x81 0x2502 # BOX DRAWINGS LIGHT VERTICAL +0x82 0x250c # BOX DRAWINGS LIGHT DOWN AND RIGHT +0x83 0x2510 # BOX DRAWINGS LIGHT DOWN AND LEFT +0x84 0x2514 # BOX DRAWINGS LIGHT UP AND RIGHT +0x85 0x2518 # BOX DRAWINGS LIGHT UP AND LEFT +0x86 0x251c # BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0x87 0x2524 # BOX DRAWINGS LIGHT VERTICAL AND LEFT +0x88 0x252c # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0x89 0x2534 # BOX DRAWINGS LIGHT UP AND HORIZONTAL +0x8a 0x253c # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0x8b 0x2580 # UPPER HALF BLOCK +0x8c 0x2584 # LOWER HALF BLOCK +0x8d 0x2588 # FULL BLOCK +0x8e 0x258c # LEFT HALF BLOCK +0x8f 0x2590 # RIGHT HALF BLOCK +0x90 0x2591 # LIGHT SHADE +0x91 0x2592 # MEDIUM SHADE +0x92 0x2593 # DARK SHADE +0x93 0x2320 # UPPER HALF OF INTEGRAL +0x94 0x25a0 # BLACK SQUARE +0x95 0x2219 # BULLET 
OPERATOR +0x96 0x221a # SQUARE ROOT +0x97 0x2248 # ALMOST EQUAL TO +0x98 0x2264 # LESS-THAN OR EQUAL TO +0x99 0x2265 # GREATER-THAN OR EQUAL TO +0x9a 0x00a0 # NO-BREAK SPACE +0x9b 0x2321 # LOWER HALF OF INTEGRAL +0x9c 0x00b0 # DEGREE SIGN +0x9d 0x00b2 # SUPERSCRIPT TWO +0x9e 0x00b7 # MIDDLE DOT +0x9f 0x00f7 # DIVISION SIGN +0xa0 0x2550 # BOX DRAWINGS DOUBLE HORIZONTAL +0xa1 0x2551 # BOX DRAWINGS DOUBLE VERTICAL +0xa2 0x2552 # BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xa3 0x0451 # CYRILLIC SMALL LETTER IO +0xa4 0x2553 # BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE +0xa5 0x2554 # BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xa6 0x2555 # BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE +0xa7 0x2556 # BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE +0xa8 0x2557 # BOX DRAWINGS DOUBLE DOWN AND LEFT +0xa9 0x2558 # BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xaa 0x2559 # BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xab 0x255a # BOX DRAWINGS DOUBLE UP AND RIGHT +0xac 0x255b # BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xad 0x255c # BOX DRAWINGS UP DOUBLE AND LEFT SINGLE +0xae 0x255d # BOX DRAWINGS DOUBLE UP AND LEFT +0xaf 0x255e # BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xb0 0x255f # BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xb1 0x2560 # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xb2 0x2561 # BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xb3 0x0401 # CYRILLIC CAPITAL LETTER IO +0xb4 0x2562 # BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE +0xb5 0x2563 # BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xb6 0x2564 # BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE +0xb7 0x2565 # BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE +0xb8 0x2566 # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xb9 0x2567 # BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xba 0x2568 # BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xbb 0x2569 # BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xbc 0x256a # BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xbd 0x256b # BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE +0xbe 0x256c # BOX DRAWINGS 
DOUBLE VERTICAL AND HORIZONTAL +0xbf 0x00a9 # COPYRIGHT SIGN +0xc0 0x044e # CYRILLIC SMALL LETTER YU +0xc1 0x0430 # CYRILLIC SMALL LETTER A +0xc2 0x0431 # CYRILLIC SMALL LETTER BE +0xc3 0x0446 # CYRILLIC SMALL LETTER TSE +0xc4 0x0434 # CYRILLIC SMALL LETTER DE +0xc5 0x0435 # CYRILLIC SMALL LETTER IE +0xc6 0x0444 # CYRILLIC SMALL LETTER EF +0xc7 0x0433 # CYRILLIC SMALL LETTER GHE +0xc8 0x0445 # CYRILLIC SMALL LETTER HA +0xc9 0x0438 # CYRILLIC SMALL LETTER I +0xca 0x0439 # CYRILLIC SMALL LETTER SHORT I +0xcb 0x043a # CYRILLIC SMALL LETTER KA +0xcc 0x043b # CYRILLIC SMALL LETTER EL +0xcd 0x043c # CYRILLIC SMALL LETTER EM +0xce 0x043d # CYRILLIC SMALL LETTER EN +0xcf 0x043e # CYRILLIC SMALL LETTER O +0xd0 0x043f # CYRILLIC SMALL LETTER PE +0xd1 0x044f # CYRILLIC SMALL LETTER YA +0xd2 0x0440 # CYRILLIC SMALL LETTER ER +0xd3 0x0441 # CYRILLIC SMALL LETTER ES +0xd4 0x0442 # CYRILLIC SMALL LETTER TE +0xd5 0x0443 # CYRILLIC SMALL LETTER U +0xd6 0x0436 # CYRILLIC SMALL LETTER ZHE +0xd7 0x0432 # CYRILLIC SMALL LETTER VE +0xd8 0x044c # CYRILLIC SMALL LETTER SOFT SIGN +0xd9 0x044b # CYRILLIC SMALL LETTER YERU +0xda 0x0437 # CYRILLIC SMALL LETTER ZE +0xdb 0x0448 # CYRILLIC SMALL LETTER SHA +0xdc 0x044d # CYRILLIC SMALL LETTER E +0xdd 0x0449 # CYRILLIC SMALL LETTER SHCHA +0xde 0x0447 # CYRILLIC SMALL LETTER CHE +0xdf 0x044a # CYRILLIC SMALL LETTER HARD SIGN +0xe0 0x042e # CYRILLIC CAPITAL LETTER YU +0xe1 0x0410 # CYRILLIC CAPITAL LETTER A +0xe2 0x0411 # CYRILLIC CAPITAL LETTER BE +0xe3 0x0426 # CYRILLIC CAPITAL LETTER TSE +0xe4 0x0414 # CYRILLIC CAPITAL LETTER DE +0xe5 0x0415 # CYRILLIC CAPITAL LETTER IE +0xe6 0x0424 # CYRILLIC CAPITAL LETTER EF +0xe7 0x0413 # CYRILLIC CAPITAL LETTER GHE +0xe8 0x0425 # CYRILLIC CAPITAL LETTER HA +0xe9 0x0418 # CYRILLIC CAPITAL LETTER I +0xea 0x0419 # CYRILLIC CAPITAL LETTER SHORT I +0xeb 0x041a # CYRILLIC CAPITAL LETTER KA +0xec 0x041b # CYRILLIC CAPITAL LETTER EL +0xed 0x041c # CYRILLIC CAPITAL LETTER EM +0xee 0x041d # CYRILLIC CAPITAL LETTER EN 
+0xef 0x041e # CYRILLIC CAPITAL LETTER O +0xf0 0x041f # CYRILLIC CAPITAL LETTER PE +0xf1 0x042f # CYRILLIC CAPITAL LETTER YA +0xf2 0x0420 # CYRILLIC CAPITAL LETTER ER +0xf3 0x0421 # CYRILLIC CAPITAL LETTER ES +0xf4 0x0422 # CYRILLIC CAPITAL LETTER TE +0xf5 0x0423 # CYRILLIC CAPITAL LETTER U +0xf6 0x0416 # CYRILLIC CAPITAL LETTER ZHE +0xf7 0x0412 # CYRILLIC CAPITAL LETTER VE +0xf8 0x042c # CYRILLIC CAPITAL LETTER SOFT SIGN +0xf9 0x042b # CYRILLIC CAPITAL LETTER YERU +0xfa 0x0417 # CYRILLIC CAPITAL LETTER ZE +0xfb 0x0428 # CYRILLIC CAPITAL LETTER SHA +0xfc 0x042d # CYRILLIC CAPITAL LETTER E +0xfd 0x0429 # CYRILLIC CAPITAL LETTER SHCHA +0xfe 0x0427 # CYRILLIC CAPITAL LETTER CHE +0xff 0x042a # CYRILLIC CAPITAL LETTER HARD SIGN diff --git a/charsets/koi8-u.txt b/charsets/koi8-u.txt new file mode 100644 index 0000000..62f452b --- /dev/null +++ b/charsets/koi8-u.txt @@ -0,0 +1,224 @@ +# koi8-u to unicode translation from rfc 2319 +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL 
LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 
0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0x80 0x2500 # BOX DRAWINGS LIGHT HORIZONTAL +0x81 0x2502 # BOX DRAWINGS LIGHT VERTICAL +0x82 0x250C # BOX DRAWINGS LIGHT DOWN AND RIGHT +0x83 0x2510 # BOX DRAWINGS LIGHT DOWN AND LEFT +0x84 0x2514 # BOX DRAWINGS LIGHT UP AND RIGHT +0x85 0x2518 # BOX DRAWINGS LIGHT UP AND LEFT +0x86 0x251C # BOX DRAWINGS LIGHT VERTICAL AND RIGHT +0x87 0x2524 # BOX DRAWINGS LIGHT VERTICAL AND LEFT +0x88 0x252C # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL +0x89 0x2534 # BOX DRAWINGS LIGHT UP AND HORIZONTAL +0x8A 0x253C # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL +0x8B 0x2580 # UPPER HALF BLOCK +0x8C 0x2584 # LOWER HALF BLOCK +0x8D 0x2588 # FULL BLOCK +0x8E 0x258C # LEFT HALF BLOCK +0x8F 0x2590 # RIGHT HALF BLOCK +0x90 0x2591 # LIGHT SHADE +0x91 0x2592 # MEDIUM SHADE +0x92 0x2593 # DARK SHADE +0x93 0x2320 # TOP HALF INTEGRAL +0x94 0x25A0 # BLACK SQUARE +0x95 0x2022 # BULLET +0x96 0x221A # SQUARE ROOT +0x97 0x2248 # ALMOST EQUAL TO +0x98 0x2264 # LESS-THAN OR EQUAL TO +0x99 0x2265 # GREATER-THAN OR EQUAL TO +0x9A 0x00A0 # NO-BREAK SPACE +0x9B 0x2321 # BOTTOM HALF INTEGRAL +0x9C 0x00B0 # DEGREE SIGN +0x9D 0x00B2 # SUPERSCRIPT TWO +0x9E 0x00B7 # MIDDLE DOT +0x9F 0x00F7 # DIVISION SIGN +0xA0 0x2550 # BOX DRAWINGS DOUBLE HORIZONTAL +0xA1 0x2551 # BOX DRAWINGS DOUBLE VERTICAL +0xA2 0x2552 # BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE +0xA3 0x0451 # CYRILLIC SMALL LETTER IO +0xA4 0x0454 # CYRILLIC SMALL LETTER UKRAINIAN IE +0xA5 0x2554 # BOX DRAWINGS DOUBLE DOWN AND RIGHT +0xA6 0x0456 # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +0xA7 0x0457 # CYRILLIC SMALL LETTER YI +0xA8 0x2557 # BOX DRAWINGS DOUBLE DOWN AND LEFT +0xA9 0x2558 # BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE +0xAA 0x2559 # BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE +0xAB 0x255A # BOX DRAWINGS DOUBLE UP AND RIGHT +0xAC 0x255B # BOX DRAWINGS UP SINGLE AND LEFT DOUBLE +0xAD 0x0491 # CYRILLIC SMALL LETTER GHE WITH UPTURN +0xAE 0x255D # BOX DRAWINGS DOUBLE UP AND LEFT +0xAF 0x255E # BOX 
DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE +0xB0 0x255F # BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE +0xB1 0x2560 # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT +0xB2 0x2561 # BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE +0xB3 0x0401 # CYRILLIC CAPITAL LETTER IO +0xB4 0x0404 # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0xB5 0x2563 # BOX DRAWINGS DOUBLE VERTICAL AND LEFT +0xB6 0x0406 # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0xB7 0x0407 # CYRILLIC CAPITAL LETTER YI +0xB8 0x2566 # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL +0xB9 0x2567 # BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE +0xBA 0x2568 # BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE +0xBB 0x2569 # BOX DRAWINGS DOUBLE UP AND HORIZONTAL +0xBC 0x256A # BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE +0xBD 0x0490 # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0xBE 0x256C # BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL +0xBF 0x00A9 # COPYRIGHT SIGN +0xC0 0x044E # CYRILLIC SMALL LETTER YU +0xC1 0x0430 # CYRILLIC SMALL LETTER A +0xC2 0x0431 # CYRILLIC SMALL LETTER BE +0xC3 0x0446 # CYRILLIC SMALL LETTER TSE +0xC4 0x0434 # CYRILLIC SMALL LETTER DE +0xC5 0x0435 # CYRILLIC SMALL LETTER IE +0xC6 0x0444 # CYRILLIC SMALL LETTER EF +0xC7 0x0433 # CYRILLIC SMALL LETTER GHE +0xC8 0x0445 # CYRILLIC SMALL LETTER HA +0xC9 0x0438 # CYRILLIC SMALL LETTER I +0xCA 0x0439 # CYRILLIC SMALL LETTER SHORT I +0xCB 0x043A # CYRILLIC SMALL LETTER KA +0xCC 0x043B # CYRILLIC SMALL LETTER EL +0xCD 0x043C # CYRILLIC SMALL LETTER EM +0xCE 0x043D # CYRILLIC SMALL LETTER EN +0xCF 0x043E # CYRILLIC SMALL LETTER O +0xD0 0x043F # CYRILLIC SMALL LETTER PE +0xD1 0x044F # CYRILLIC SMALL LETTER YA +0xD2 0x0440 # CYRILLIC SMALL LETTER ER +0xD3 0x0441 # CYRILLIC SMALL LETTER ES +0xD4 0x0442 # CYRILLIC SMALL LETTER TE +0xD5 0x0443 # CYRILLIC SMALL LETTER U +0xD6 0x0436 # CYRILLIC SMALL LETTER ZHE +0xD7 0x0432 # CYRILLIC SMALL LETTER VE +0xD8 0x044C # CYRILLIC SMALL LETTER SOFT SIGN +0xD9 0x044B # CYRILLIC SMALL LETTER YERU +0xDA 0x0437 # CYRILLIC SMALL LETTER 
ZE +0xDB 0x0448 # CYRILLIC SMALL LETTER SHA +0xDC 0x044D # CYRILLIC SMALL LETTER E +0xDD 0x0449 # CYRILLIC SMALL LETTER SHCHA +0xDE 0x0447 # CYRILLIC SMALL LETTER CHE +0xDF 0x044A # CYRILLIC SMALL LETTER HARD SIGN +0xE0 0x042E # CYRILLIC CAPITAL LETTER YU +0xE1 0x0410 # CYRILLIC CAPITAL LETTER A +0xE2 0x0411 # CYRILLIC CAPITAL LETTER BE +0xE3 0x0426 # CYRILLIC CAPITAL LETTER TSE +0xE4 0x0414 # CYRILLIC CAPITAL LETTER DE +0xE5 0x0415 # CYRILLIC CAPITAL LETTER IE +0xE6 0x0424 # CYRILLIC CAPITAL LETTER EF +0xE7 0x0413 # CYRILLIC CAPITAL LETTER GHE +0xE8 0x0425 # CYRILLIC CAPITAL LETTER HA +0xE9 0x0418 # CYRILLIC CAPITAL LETTER I +0xEA 0x0419 # CYRILLIC CAPITAL LETTER SHORT I +0xEB 0x041A # CYRILLIC CAPITAL LETTER KA +0xEC 0x041B # CYRILLIC CAPITAL LETTER EL +0xED 0x041C # CYRILLIC CAPITAL LETTER EM +0xEE 0x041D # CYRILLIC CAPITAL LETTER EN +0xEF 0x041E # CYRILLIC CAPITAL LETTER O +0xF0 0x041F # CYRILLIC CAPITAL LETTER PE +0xF1 0x042F # CYRILLIC CAPITAL LETTER YA +0xF2 0x0420 # CYRILLIC CAPITAL LETTER ER +0xF3 0x0421 # CYRILLIC CAPITAL LETTER ES +0xF4 0x0422 # CYRILLIC CAPITAL LETTER TE +0xF5 0x0423 # CYRILLIC CAPITAL LETTER U +0xF6 0x0416 # CYRILLIC CAPITAL LETTER ZHE +0xF7 0x0412 # CYRILLIC CAPITAL LETTER VE +0xF8 0x042C # CYRILLIC CAPITAL LETTER SOFT SIGN +0xF9 0x042B # CYRILLIC CAPITAL LETTER YERU +0xFA 0x0417 # CYRILLIC CAPITAL LETTER ZE +0xFB 0x0428 # CYRILLIC CAPITAL LETTER SHA +0xFC 0x042D # CYRILLIC CAPITAL LETTER E +0xFD 0x0429 # CYRILLIC CAPITAL LETTER SHCHA +0xFE 0x0427 # CYRILLIC CAPITAL LETTER CHE +0xFF 0x042A # CYRILLIC CAPITAL LETTER HARD SIGN diff --git a/charsets/mac-arabic.txt b/charsets/mac-arabic.txt new file mode 100644 index 0000000..b5652a7 --- /dev/null +++ b/charsets/mac-arabic.txt @@ -0,0 +1,536 @@ +#======================================================================= +# File name: ARABIC.TXT +# +# Contents: Map (external version) from Mac OS Arabic +# character set to Unicode 2.1 and later. 
+# +# Copyright: (c) 1994-2002, 2005 by Apple Computer, Inc., all rights +# reserved. +# +# Contact: charsets@apple.com +# +# Changes: +# +# c02 2005-Apr-04 Update header comments. Matches internal xml +# and Text Encoding Converter 2.0. +# b3,c1 2002-Dec-19 Add comments about character display and +# direction overrides. Update URLs, notes. +# Matches internal utom. +# b02 1999-Sep-22 Update contact e-mail address. Matches +# internal utom, ufrm, and Text +# Encoding Converter version 1.5. +# n10 1998-Feb-05 Show required Unicode character +# directionality in a different way. Matches +# internal utom, ufrm, and Text +# Encoding Converter version 1.3. Update +# header comments; include information on +# loose mapping of digits. +# n07 1997-Jul-17 Update to match internal utom, ufrm: +# Change standard mapping for 0xC0 from U+066D +# to U+274A. Add direction overrides to +# mappings for 0x25, 0x2C, 0x3B, 0x3F. Add +# information on variants. +# n03 1995-Apr-18 First version (after fixing some typos). +# Matches internal ufrm. +# +# Standard header: +# ---------------- +# +# Apple, the Apple logo, and Macintosh are trademarks of Apple +# Computer, Inc., registered in the United States and other countries. +# Unicode is a trademark of Unicode Inc. For the sake of brevity, +# throughout this document, "Macintosh" can be used to refer to +# Macintosh computers and "Unicode" can be used to refer to the +# Unicode standard. +# +# Apple Computer, Inc. ("Apple") makes no warranty or representation, +# either express or implied, with respect to this document and the +# included data, its quality, accuracy, or fitness for a particular +# purpose. In no event will Apple be liable for direct, indirect, +# special, incidental, or consequential damages resulting from any +# defect or inaccuracy in this document or the included data. +# +# These mapping tables and character lists are subject to change. 
+# The latest tables should be available from the following: +# +# <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> +# +# For general information about Mac OS encodings and these mapping +# tables, see the file "README.TXT". +# +# Format: +# ------- +# +# Three tab-separated columns; +# '#' begins a comment which continues to the end of the line. +# Column #1 is the Mac OS Arabic code (in hex as 0xNN). +# Column #2 is the corresponding Unicode (in hex as 0xNNNN), +# possibly preceded by a tag indicating required directionality +# (i.e. <LR>+0xNNNN or <RL>+0xNNNN). +# Column #3 is a comment containing the Unicode name. +# +# The entries are in Mac OS Arabic code order. +# +# Control character mappings are not shown in this table, following +# the conventions of the standard UTC mapping tables. However, the +# Mac OS Arabic character set uses the standard control characters at +# 0x00-0x1F and 0x7F. +# +# Notes on Mac OS Arabic: +# ----------------------- +# +# This is a legacy Mac OS encoding; in the Mac OS X Carbon and Cocoa +# environments, it is only supported via transcoding to and from +# Unicode. +# +# 1. General +# +# The Mac OS Arabic character set is intended to cover Arabic as +# used in North Africa, the Arabian peninsula, and the Levant. It +# also contains several characters needed for Urdu and/or Farsi. +# +# The Mac OS Arabic character set is essentially a superset of ISO +# 8859-6. The 8859-6 code points that are interpreted differently +# in the Mac OS Arabic set are as follows: +# 0xA0 is NO-BREAK SPACE in 8859-6 and right-left SPACE in Mac OS +# Arabic; NO-BREAK is 0x81 in Mac OS Arabic. +# 0xA4 is CURRENCY SIGN in 8859-6 and right-left DOLLAR SIGN in +# Mac OS Arabic. +# 0xAD is SOFT HYPHEN in 8859-6 and right-left HYPHEN-MINUS in +# Mac OS Arabic. +# ISO 8859-6 specifies that codes 0x30-0x39 can be rendered either +# with European digit shapes or Arabic digit shapes. This is also +# true in Mac OS Arabic, which determines from context which digit +# shapes to use (see below).
+# +# The Mac OS Arabic character set uses the C1 controls area and other +# code points which are undefined in ISO 8859-6 for additional +# graphic characters: additional Arabic letters for Farsi and Urdu, +# some accented Roman letters for European languages (such as French), +# and duplicates of some of the punctuation, symbols, and digits in +# the ASCII block. The duplicate punctuation, symbol, and digit +# characters have right-left directionality, while the ASCII versions +# have left-right directionality. See the next section for more +# information on this. +# +# Mac OS Arabic characters 0xEB-0xF2 are non-spacing/combining marks. +# +# 2. Directional characters and roundtrip fidelity +# +# The Mac OS Arabic character set was developed in 1986-1987. At that +# time the bidirectional line layout algorithm used in the Mac OS +# Arabic system was fairly simple; it used only a few direction +# classes (instead of the 19 now used in the Unicode bidirectional +# algorithm). In order to permit users to handle some tricky layout +# problems, certain punctuation and symbol characters were encoded +# twice, one with a left-right direction attribute and the other with +# a right-left direction attribute. +# +# For example, plus sign is encoded at 0x2B with a left-right +# attribute, and at 0xAB with a right-left attribute. However, there +# is only one PLUS SIGN character in Unicode. This leads to some +# interesting problems when mapping between Mac OS Arabic and Unicode; +# see below. +# +# A related problem is that even when a particular character is +# encoded only once in Mac OS Arabic, it may have a different +# direction attribute than the corresponding Unicode character. +# +# For example, the Mac OS Arabic character at 0x93 is HORIZONTAL +# ELLIPSIS with strong right-left direction. However, the Unicode +# character HORIZONTAL ELLIPSIS has direction class neutral. +# +# 3. 
Behavior of ASCII-range numbers in WorldScript +# +# Mac OS Arabic also has two sets of digit codes. +# +# The digits at 0x30-0x39 may be displayed using either European +# digit forms or Arabic digit forms, depending on context. If there +# is a "strong European" character such as a Latin letter on either +# side of a sequence consisting of digits 0x30-0x39 and possibly comma +# 0x2C or period 0x2E, then the characters will be displayed using +# European forms (This will happen even if there are neutral characters +# between the digits and the strong European character). Otherwise, the +# digits will be displayed using Arabic forms, the comma will be +# displayed as Arabic thousands separator, and the period as Arabic +# decimal separator. In any case, 0x2C, 0x2E, and 0x30-0x39 are always +# left-right. +# +# The digits at 0xB0-0xB9 are always displayed using Arabic digit +# shapes, and moreover, these digits always have strong right-left +# directionality. These are mainly intended for special layout +# purposes such as part numbers, etc. +# +# 4. Font variants +# +# The table in this file gives the Unicode mappings for the standard +# Mac OS Arabic encoding. This encoding is supported by the Cairo font +# (the system font for Arabic), and is the encoding supported by the +# text processing utilities. However, the other Arabic fonts actually +# implement slightly different encodings; this mainly affects the code +# points 0xAA and 0xC0. For these code points the standard Mac OS +# Arabic encoding has the following mappings: +# 0xAA -> <RL>+0x002A ASTERISK, right-left +# 0xC0 -> <RL>+0x274A EIGHT TEARDROP-SPOKED PROPELLER ASTERISK, +# right-left +# This mapping of 0xAA is consistent with the normal convention for +# Mac OS Arabic and Hebrew that the right-left duplicates have codes +# that are equal to the ASCII code of the left-right character plus +# 0x80.
However, in all of the other fonts, 0xAA is MULTIPLY SIGN, and +# right-left ASTERISK may be at a different code point. The other +# variants are described below. +# +# The TrueType variant is used for most of the Arabic TrueType fonts: +# Baghdad, Geeza, Kufi, Nadeem. It differs from the standard variant +# in the following way: +# 0xAA -> <RL>+0x00D7 MULTIPLICATION SIGN, right-left +# 0xC0 -> <RL>+0x002A ASTERISK, right-left +# +# The Thuluth variant is used for the Arabic Postscript-only fonts: +# Thuluth and Thuluth bold. It differs from the standard variant in +# the following way: +# 0xAA -> <RL>+0x00D7 MULTIPLICATION SIGN, right-left +# 0xC0 -> 0x066D ARABIC FIVE POINTED STAR +# +# The AlBayan variant is used for the Arabic TrueType font Al Bayan. +# It differs from the standard variant in the following way: +# 0x81 -> no mapping (glyph just has authorship information, etc.) +# 0xA3 -> 0xFDFA ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM +# 0xA4 -> 0xFDF2 ARABIC LIGATURE ALLAH ISOLATED FORM +# 0xAA -> <RL>+0x00D7 MULTIPLICATION SIGN, right-left +# 0xDC -> <RL>+0x25CF BLACK CIRCLE, right-left +# 0xFC -> <RL>+0x25A0 BLACK SQUARE, right-left +# +# Unicode mapping issues and notes: +# --------------------------------- +# +# 1. Matching the direction of Mac OS Arabic characters +# +# When Mac OS Arabic encodes a character twice but with different +# direction attributes for the two code points - as in the case of +# plus sign mentioned above - we need a way to map both Mac OS Arabic +# code points to Unicode and back again without loss of information. +# With the plus sign, for example, mapping one of the Mac OS Arabic +# characters to a code in the Unicode corporate use zone is +# undesirable, since both of the plus sign characters are likely to +# be used in text that is interchanged. +# +# The problem is solved with the use of direction override characters +# and direction-dependent mappings.
When mapping from Mac OS Arabic +# to Unicode, we use direction overrides as necessary to force the +# direction of the resulting Unicode characters. +# +# The required direction is indicated by a direction tag in the +# mappings. A tag of <LR> means the corresponding Unicode character +# must have a strong left-right context, and a tag of <RL> indicates +# a right-left context. +# +# For example, the mapping of 0x2B is given as <LR>+0x002B; the +# mapping of 0xAB is given as <RL>+0x002B. If we map an isolated +# instance of 0x2B to Unicode, it should be mapped as follows (LRO +# indicates LEFT-RIGHT OVERRIDE, PDF indicates POP DIRECTION +# FORMATTING): +# +# 0x2B -> 0x202D (LRO) + 0x002B (PLUS SIGN) + 0x202C (PDF) +# +# When mapping several characters in a row that require direction +# forcing, the overrides need only be used at the beginning and end. +# For example: +# +# 0x24 0x20 0x28 0x29 -> 0x202D 0x0024 0x0020 0x0028 0x0029 0x202C +# +# If neutral characters that require direction forcing are already +# between strong-direction characters with matching directionality, +# then direction overrides need not be used. Direction overrides are +# always needed to map the right-left digits at 0xB0-0xB9. +# +# When mapping from Unicode to Mac OS Arabic, the Unicode +# bidirectional algorithm should be used to determine resolved +# direction of the Unicode characters. The mapping from Unicode to +# Mac OS Arabic can then be disambiguated by the use of the resolved +# direction: +# +# Unicode 0x002B -> Mac OS Arabic 0x2B (if L) or 0xAB (if R) +# +# However, this also means the direction override characters should +# be discarded when mapping from Unicode to Mac OS Arabic (after +# they have been used to determine resolved direction), since the +# direction override information is carried by the code point itself. 
+# +# Even when direction overrides are not needed for roundtrip +# fidelity, they are sometimes used when mapping Mac OS Arabic +# characters to Unicode in order to achieve similar text layout with +# the resulting Unicode text. For example, the single Mac OS Arabic +# ellipsis character has direction class right-left, and there is no +# left-right version. However, the Unicode HORIZONTAL ELLIPSIS +# character has direction class neutral (which means it may end up +# with a resolved direction of left-right if surrounded by left-right +# characters). When mapping the Mac OS Arabic ellipsis to Unicode, it +# is surrounded with a direction override to help preserve proper +# text layout. The resolved direction is not needed or used when +# mapping the Unicode HORIZONTAL ELLIPSIS back to Mac OS Arabic. +# +# 2. Mapping the Mac OS Arabic digits +# +# The main table below contains mappings that should be used when +# strict round-trip fidelity is required. However, for numeric +# values, the mappings in that table will produce Unicode characters +# that may appear different than the Mac OS Arabic text displayed on +# a Mac OS system using WorldScript. This is because WorldScript +# uses context-dependent display for the 0x30-0x39 digits. 
+# +# If roundtrip fidelity is not required, then the following +# alternate mappings should be used when a sequence of 0x30-0x39 +# digits - possibly including 0x2C and 0x2E - occurs in an Arabic +# context (that is, when the first "strong" character on either side +# of the digit sequence is Arabic, or there is no strong character): +# +# 0x2C 0x066C # ARABIC THOUSANDS SEPARATOR +# 0x2E 0x066B # ARABIC DECIMAL SEPARATOR +# 0x30 0x0660 # ARABIC-INDIC DIGIT ZERO +# 0x31 0x0661 # ARABIC-INDIC DIGIT ONE +# 0x32 0x0662 # ARABIC-INDIC DIGIT TWO +# 0x33 0x0663 # ARABIC-INDIC DIGIT THREE +# 0x34 0x0664 # ARABIC-INDIC DIGIT FOUR +# 0x35 0x0665 # ARABIC-INDIC DIGIT FIVE +# 0x36 0x0666 # ARABIC-INDIC DIGIT SIX +# 0x37 0x0667 # ARABIC-INDIC DIGIT SEVEN +# 0x38 0x0668 # ARABIC-INDIC DIGIT EIGHT +# 0x39 0x0669 # ARABIC-INDIC DIGIT NINE +# +# Details of mapping changes in each version: +# ------------------------------------------- +# +# Changes from version n03 to version n07: +# +# - Change mapping for 0xC0 from U+066D to U+274A. +# +# - Add direction overrides (required directionality) to mappings +# for 0x25, 0x2C, 0x3B, 0x3F. 
+# +################## + +0x20 +0x0020 # SPACE, left-right +0x21 +0x0021 # EXCLAMATION MARK, left-right +0x22 +0x0022 # QUOTATION MARK, left-right +0x23 +0x0023 # NUMBER SIGN, left-right +0x24 +0x0024 # DOLLAR SIGN, left-right +0x25 +0x0025 # PERCENT SIGN, left-right +0x26 +0x0026 # AMPERSAND, left-right +0x27 +0x0027 # APOSTROPHE, left-right +0x28 +0x0028 # LEFT PARENTHESIS, left-right +0x29 +0x0029 # RIGHT PARENTHESIS, left-right +0x2A +0x002A # ASTERISK, left-right +0x2B +0x002B # PLUS SIGN, left-right +0x2C +0x002C # COMMA, left-right; in Arabic-script context, displayed as 0x066C ARABIC THOUSANDS SEPARATOR +0x2D +0x002D # HYPHEN-MINUS, left-right +0x2E +0x002E # FULL STOP, left-right; in Arabic-script context, displayed as 0x066B ARABIC DECIMAL SEPARATOR +0x2F +0x002F # SOLIDUS, left-right +0x30 0x0030 # DIGIT ZERO; in Arabic-script context, displayed as 0x0660 ARABIC-INDIC DIGIT ZERO +0x31 0x0031 # DIGIT ONE; in Arabic-script context, displayed as 0x0661 ARABIC-INDIC DIGIT ONE +0x32 0x0032 # DIGIT TWO; in Arabic-script context, displayed as 0x0662 ARABIC-INDIC DIGIT TWO +0x33 0x0033 # DIGIT THREE; in Arabic-script context, displayed as 0x0663 ARABIC-INDIC DIGIT THREE +0x34 0x0034 # DIGIT FOUR; in Arabic-script context, displayed as 0x0664 ARABIC-INDIC DIGIT FOUR +0x35 0x0035 # DIGIT FIVE; in Arabic-script context, displayed as 0x0665 ARABIC-INDIC DIGIT FIVE +0x36 0x0036 # DIGIT SIX; in Arabic-script context, displayed as 0x0666 ARABIC-INDIC DIGIT SIX +0x37 0x0037 # DIGIT SEVEN; in Arabic-script context, displayed as 0x0667 ARABIC-INDIC DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT; in Arabic-script context, displayed as 0x0668 ARABIC-INDIC DIGIT EIGHT +0x39 0x0039 # DIGIT NINE; in Arabic-script context, displayed as 0x0669 ARABIC-INDIC DIGIT NINE +0x3A +0x003A # COLON, left-right +0x3B +0x003B # SEMICOLON, left-right +0x3C +0x003C # LESS-THAN SIGN, left-right +0x3D +0x003D # EQUALS SIGN, left-right +0x3E +0x003E # GREATER-THAN SIGN, left-right +0x3F +0x003F # 
QUESTION MARK, left-right +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B +0x005B # LEFT SQUARE BRACKET, left-right +0x5C +0x005C # REVERSE SOLIDUS, left-right +0x5D +0x005D # RIGHT SQUARE BRACKET, left-right +0x5E +0x005E # CIRCUMFLEX ACCENT, left-right +0x5F +0x005F # LOW LINE, left-right +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN 
SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B +0x007B # LEFT CURLY BRACKET, left-right +0x7C +0x007C # VERTICAL LINE, left-right +0x7D +0x007D # RIGHT CURLY BRACKET, left-right +0x7E 0x007E # TILDE +# +0x80 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0x81 +0x00A0 # NO-BREAK SPACE, right-left +0x82 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0x83 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0x84 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0x85 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0x86 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0x87 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0x88 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0x89 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0x8A 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0x8B 0x06BA # ARABIC LETTER NOON GHUNNA +0x8C +0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK, right-left +0x8D 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0x8E 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0x8F 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0x90 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0x91 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0x92 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0x93 +0x2026 # HORIZONTAL ELLIPSIS, right-left +0x94 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0x95 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0x96 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0x97 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0x98 +0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK, right-left +0x99 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0x9A 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0x9B +0x00F7 # DIVISION SIGN, right-left +0x9C 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0x9D 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0x9E 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0x9F 0x00FC # LATIN SMALL LETTER U WITH 
DIAERESIS +0xA0 +0x0020 # SPACE, right-left +0xA1 +0x0021 # EXCLAMATION MARK, right-left +0xA2 +0x0022 # QUOTATION MARK, right-left +0xA3 +0x0023 # NUMBER SIGN, right-left +0xA4 +0x0024 # DOLLAR SIGN, right-left +0xA5 0x066A # ARABIC PERCENT SIGN +0xA6 +0x0026 # AMPERSAND, right-left +0xA7 +0x0027 # APOSTROPHE, right-left +0xA8 +0x0028 # LEFT PARENTHESIS, right-left +0xA9 +0x0029 # RIGHT PARENTHESIS, right-left +0xAA +0x002A # ASTERISK, right-left +0xAB +0x002B # PLUS SIGN, right-left +0xAC 0x060C # ARABIC COMMA +0xAD +0x002D # HYPHEN-MINUS, right-left +0xAE +0x002E # FULL STOP, right-left +0xAF +0x002F # SOLIDUS, right-left +0xB0 +0x0660 # ARABIC-INDIC DIGIT ZERO, right-left (need override) +0xB1 +0x0661 # ARABIC-INDIC DIGIT ONE, right-left (need override) +0xB2 +0x0662 # ARABIC-INDIC DIGIT TWO, right-left (need override) +0xB3 +0x0663 # ARABIC-INDIC DIGIT THREE, right-left (need override) +0xB4 +0x0664 # ARABIC-INDIC DIGIT FOUR, right-left (need override) +0xB5 +0x0665 # ARABIC-INDIC DIGIT FIVE, right-left (need override) +0xB6 +0x0666 # ARABIC-INDIC DIGIT SIX, right-left (need override) +0xB7 +0x0667 # ARABIC-INDIC DIGIT SEVEN, right-left (need override) +0xB8 +0x0668 # ARABIC-INDIC DIGIT EIGHT, right-left (need override) +0xB9 +0x0669 # ARABIC-INDIC DIGIT NINE, right-left (need override) +0xBA +0x003A # COLON, right-left +0xBB 0x061B # ARABIC SEMICOLON +0xBC +0x003C # LESS-THAN SIGN, right-left +0xBD +0x003D # EQUALS SIGN, right-left +0xBE +0x003E # GREATER-THAN SIGN, right-left +0xBF 0x061F # ARABIC QUESTION MARK +0xC0 +0x274A # EIGHT TEARDROP-SPOKED PROPELLER ASTERISK, right-left +0xC1 0x0621 # ARABIC LETTER HAMZA +0xC2 0x0622 # ARABIC LETTER ALEF WITH MADDA ABOVE +0xC3 0x0623 # ARABIC LETTER ALEF WITH HAMZA ABOVE +0xC4 0x0624 # ARABIC LETTER WAW WITH HAMZA ABOVE +0xC5 0x0625 # ARABIC LETTER ALEF WITH HAMZA BELOW +0xC6 0x0626 # ARABIC LETTER YEH WITH HAMZA ABOVE +0xC7 0x0627 # ARABIC LETTER ALEF +0xC8 0x0628 # ARABIC LETTER BEH +0xC9 0x0629 # ARABIC LETTER 
TEH MARBUTA +0xCA 0x062A # ARABIC LETTER TEH +0xCB 0x062B # ARABIC LETTER THEH +0xCC 0x062C # ARABIC LETTER JEEM +0xCD 0x062D # ARABIC LETTER HAH +0xCE 0x062E # ARABIC LETTER KHAH +0xCF 0x062F # ARABIC LETTER DAL +0xD0 0x0630 # ARABIC LETTER THAL +0xD1 0x0631 # ARABIC LETTER REH +0xD2 0x0632 # ARABIC LETTER ZAIN +0xD3 0x0633 # ARABIC LETTER SEEN +0xD4 0x0634 # ARABIC LETTER SHEEN +0xD5 0x0635 # ARABIC LETTER SAD +0xD6 0x0636 # ARABIC LETTER DAD +0xD7 0x0637 # ARABIC LETTER TAH +0xD8 0x0638 # ARABIC LETTER ZAH +0xD9 0x0639 # ARABIC LETTER AIN +0xDA 0x063A # ARABIC LETTER GHAIN +0xDB +0x005B # LEFT SQUARE BRACKET, right-left +0xDC +0x005C # REVERSE SOLIDUS, right-left +0xDD +0x005D # RIGHT SQUARE BRACKET, right-left +0xDE +0x005E # CIRCUMFLEX ACCENT, right-left +0xDF +0x005F # LOW LINE, right-left +0xE0 0x0640 # ARABIC TATWEEL +0xE1 0x0641 # ARABIC LETTER FEH +0xE2 0x0642 # ARABIC LETTER QAF +0xE3 0x0643 # ARABIC LETTER KAF +0xE4 0x0644 # ARABIC LETTER LAM +0xE5 0x0645 # ARABIC LETTER MEEM +0xE6 0x0646 # ARABIC LETTER NOON +0xE7 0x0647 # ARABIC LETTER HEH +0xE8 0x0648 # ARABIC LETTER WAW +0xE9 0x0649 # ARABIC LETTER ALEF MAKSURA +0xEA 0x064A # ARABIC LETTER YEH +0xEB 0x064B # ARABIC FATHATAN +0xEC 0x064C # ARABIC DAMMATAN +0xED 0x064D # ARABIC KASRATAN +0xEE 0x064E # ARABIC FATHA +0xEF 0x064F # ARABIC DAMMA +0xF0 0x0650 # ARABIC KASRA +0xF1 0x0651 # ARABIC SHADDA +0xF2 0x0652 # ARABIC SUKUN +0xF3 0x067E # ARABIC LETTER PEH +0xF4 0x0679 # ARABIC LETTER TTEH +0xF5 0x0686 # ARABIC LETTER TCHEH +0xF6 0x06D5 # ARABIC LETTER AE +0xF7 0x06A4 # ARABIC LETTER VEH +0xF8 0x06AF # ARABIC LETTER GAF +0xF9 0x0688 # ARABIC LETTER DDAL +0xFA 0x0691 # ARABIC LETTER RREH +0xFB +0x007B # LEFT CURLY BRACKET, right-left +0xFC +0x007C # VERTICAL LINE, right-left +0xFD +0x007D # RIGHT CURLY BRACKET, right-left +0xFE 0x0698 # ARABIC LETTER JEH +0xFF 0x06D2 # ARABIC LETTER YEH BARREE diff --git a/charsets/mac-centeuro.txt b/charsets/mac-centeuro.txt new file mode 100644 index 
0000000..541e559 --- /dev/null +++ b/charsets/mac-centeuro.txt @@ -0,0 +1,327 @@ +#======================================================================= +# File name: CENTEURO.TXT +# +# Contents: Map (external version) from Mac OS Central European +# character set to Unicode 2.1 and later. +# +# Copyright: (c) 1995-2002, 2005 by Apple Computer, Inc., all rights +# reserved. +# +# Contact: charsets@apple.com +# +# Changes: +# +# c02 2005-Apr-04 Update header comments. Matches internal xml +# and Text Encoding Converter 2.0. +# b3,c1 2002-Dec-19 Update URLs. Matches internal utom. +# b02 1999-Sep-22 Update contact e-mail address. Matches +# internal utom, ufrm, and Text +# Encoding Converter version 1.5. +# n05 1998-Feb-05 Update header comments to new format; no +# mapping changes. Matches internal utom, +# ufrm, and Text Encoding Converter +# version 1.3. +# n03 1995-Apr-15 First version (after fixing some typos). +# Matches internal ufrm. +# +# Standard header: +# ---------------- +# +# Apple, the Apple logo, and Macintosh are trademarks of Apple +# Computer, Inc., registered in the United States and other countries. +# Unicode is a trademark of Unicode Inc. For the sake of brevity, +# throughout this document, "Macintosh" can be used to refer to +# Macintosh computers and "Unicode" can be used to refer to the +# Unicode standard. +# +# Apple Computer, Inc. ("Apple") makes no warranty or representation, +# either express or implied, with respect to this document and the +# included data, its quality, accuracy, or fitness for a particular +# purpose. In no event will Apple be liable for direct, indirect, +# special, incidental, or consequential damages resulting from any +# defect or inaccuracy in this document or the included data. +# +# These mapping tables and character lists are subject to change. 
+# The latest tables should be available from the following: +# +# <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> +# +# For general information about Mac OS encodings and these mapping +# tables, see the file "README.TXT". +# +# Format: +# ------- +# +# Three tab-separated columns; +# '#' begins a comment which continues to the end of the line. +# Column #1 is the Mac OS Central European code (in hex as 0xNN) +# Column #2 is the corresponding Unicode (in hex as 0xNNNN) +# Column #3 is a comment containing the Unicode name +# +# The entries are in Mac OS Central European code order. +# +# Control character mappings are not shown in this table, following +# the conventions of the standard UTC mapping tables. However, the +# Mac OS Central European character set uses the standard control +# characters at 0x00-0x1F and 0x7F. +# +# Notes on Mac OS Central European: +# --------------------------------- +# +# This is a legacy Mac OS encoding; in the Mac OS X Carbon and Cocoa +# environments, it is only supported directly in programming +# interfaces for QuickDraw Text, the Script Manager, and related +# Text Utilities. For other purposes it is supported via transcoding +# to and from Unicode. +# +# This character set is intended to cover the following languages: +# +# Polish, Czech, Slovak, Hungarian, Estonian, Latvian, Lithuanian +# +# These are written in Latin script, but using a different set of +# accented characters than Mac OS Roman. The Mac OS Central +# European character set also includes a number of characters +# needed for the Mac OS user interface and localization (e.g. +# ellipsis, bullet, copyright sign), several typographic +# punctuation symbols, math symbols, etc. However, it has a +# smaller set of punctuation and symbols than Mac OS Roman. All of +# the characters in Mac OS Central European that are also in the +# Mac OS Roman character set are at the same code point in both +# character sets; this improves application compatibility. 
+# +# Note: This does not have the same letter repertoire as ISO +# 8859-2 (Latin-2); each has some accented letters that the other +# does not have. +# +# Unicode mapping issues and notes: +# --------------------------------- +# +# Details of mapping changes in each version: +# ------------------------------------------- +# +################## + +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 
0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +# +0x80 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0x81 0x0100 # LATIN CAPITAL LETTER A WITH MACRON +0x82 0x0101 # LATIN SMALL LETTER A WITH MACRON +0x83 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0x84 0x0104 # LATIN CAPITAL LETTER A WITH OGONEK +0x85 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0x86 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0x87 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0x88 0x0105 # LATIN SMALL LETTER A WITH OGONEK +0x89 0x010C # LATIN CAPITAL LETTER C WITH CARON +0x8A 0x00E4 # LATIN SMALL 
LETTER A WITH DIAERESIS +0x8B 0x010D # LATIN SMALL LETTER C WITH CARON +0x8C 0x0106 # LATIN CAPITAL LETTER C WITH ACUTE +0x8D 0x0107 # LATIN SMALL LETTER C WITH ACUTE +0x8E 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0x8F 0x0179 # LATIN CAPITAL LETTER Z WITH ACUTE +0x90 0x017A # LATIN SMALL LETTER Z WITH ACUTE +0x91 0x010E # LATIN CAPITAL LETTER D WITH CARON +0x92 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0x93 0x010F # LATIN SMALL LETTER D WITH CARON +0x94 0x0112 # LATIN CAPITAL LETTER E WITH MACRON +0x95 0x0113 # LATIN SMALL LETTER E WITH MACRON +0x96 0x0116 # LATIN CAPITAL LETTER E WITH DOT ABOVE +0x97 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0x98 0x0117 # LATIN SMALL LETTER E WITH DOT ABOVE +0x99 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0x9A 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0x9B 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0x9C 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0x9D 0x011A # LATIN CAPITAL LETTER E WITH CARON +0x9E 0x011B # LATIN SMALL LETTER E WITH CARON +0x9F 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xA0 0x2020 # DAGGER +0xA1 0x00B0 # DEGREE SIGN +0xA2 0x0118 # LATIN CAPITAL LETTER E WITH OGONEK +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A7 # SECTION SIGN +0xA5 0x2022 # BULLET +0xA6 0x00B6 # PILCROW SIGN +0xA7 0x00DF # LATIN SMALL LETTER SHARP S +0xA8 0x00AE # REGISTERED SIGN +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x2122 # TRADE MARK SIGN +0xAB 0x0119 # LATIN SMALL LETTER E WITH OGONEK +0xAC 0x00A8 # DIAERESIS +0xAD 0x2260 # NOT EQUAL TO +0xAE 0x0123 # LATIN SMALL LETTER G WITH CEDILLA +0xAF 0x012E # LATIN CAPITAL LETTER I WITH OGONEK +0xB0 0x012F # LATIN SMALL LETTER I WITH OGONEK +0xB1 0x012A # LATIN CAPITAL LETTER I WITH MACRON +0xB2 0x2264 # LESS-THAN OR EQUAL TO +0xB3 0x2265 # GREATER-THAN OR EQUAL TO +0xB4 0x012B # LATIN SMALL LETTER I WITH MACRON +0xB5 0x0136 # LATIN CAPITAL LETTER K WITH CEDILLA +0xB6 0x2202 # PARTIAL DIFFERENTIAL +0xB7 0x2211 # N-ARY SUMMATION +0xB8 0x0142 # LATIN SMALL LETTER L WITH STROKE +0xB9 0x013B # LATIN 
CAPITAL LETTER L WITH CEDILLA +0xBA 0x013C # LATIN SMALL LETTER L WITH CEDILLA +0xBB 0x013D # LATIN CAPITAL LETTER L WITH CARON +0xBC 0x013E # LATIN SMALL LETTER L WITH CARON +0xBD 0x0139 # LATIN CAPITAL LETTER L WITH ACUTE +0xBE 0x013A # LATIN SMALL LETTER L WITH ACUTE +0xBF 0x0145 # LATIN CAPITAL LETTER N WITH CEDILLA +0xC0 0x0146 # LATIN SMALL LETTER N WITH CEDILLA +0xC1 0x0143 # LATIN CAPITAL LETTER N WITH ACUTE +0xC2 0x00AC # NOT SIGN +0xC3 0x221A # SQUARE ROOT +0xC4 0x0144 # LATIN SMALL LETTER N WITH ACUTE +0xC5 0x0147 # LATIN CAPITAL LETTER N WITH CARON +0xC6 0x2206 # INCREMENT +0xC7 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC8 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC9 0x2026 # HORIZONTAL ELLIPSIS +0xCA 0x00A0 # NO-BREAK SPACE +0xCB 0x0148 # LATIN SMALL LETTER N WITH CARON +0xCC 0x0150 # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0xCD 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xCE 0x0151 # LATIN SMALL LETTER O WITH DOUBLE ACUTE +0xCF 0x014C # LATIN CAPITAL LETTER O WITH MACRON +0xD0 0x2013 # EN DASH +0xD1 0x2014 # EM DASH +0xD2 0x201C # LEFT DOUBLE QUOTATION MARK +0xD3 0x201D # RIGHT DOUBLE QUOTATION MARK +0xD4 0x2018 # LEFT SINGLE QUOTATION MARK +0xD5 0x2019 # RIGHT SINGLE QUOTATION MARK +0xD6 0x00F7 # DIVISION SIGN +0xD7 0x25CA # LOZENGE +0xD8 0x014D # LATIN SMALL LETTER O WITH MACRON +0xD9 0x0154 # LATIN CAPITAL LETTER R WITH ACUTE +0xDA 0x0155 # LATIN SMALL LETTER R WITH ACUTE +0xDB 0x0158 # LATIN CAPITAL LETTER R WITH CARON +0xDC 0x2039 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0xDD 0x203A # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0xDE 0x0159 # LATIN SMALL LETTER R WITH CARON +0xDF 0x0156 # LATIN CAPITAL LETTER R WITH CEDILLA +0xE0 0x0157 # LATIN SMALL LETTER R WITH CEDILLA +0xE1 0x0160 # LATIN CAPITAL LETTER S WITH CARON +0xE2 0x201A # SINGLE LOW-9 QUOTATION MARK +0xE3 0x201E # DOUBLE LOW-9 QUOTATION MARK +0xE4 0x0161 # LATIN SMALL LETTER S WITH CARON +0xE5 0x015A # LATIN CAPITAL LETTER S WITH ACUTE +0xE6 0x015B # 
LATIN SMALL LETTER S WITH ACUTE +0xE7 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xE8 0x0164 # LATIN CAPITAL LETTER T WITH CARON +0xE9 0x0165 # LATIN SMALL LETTER T WITH CARON +0xEA 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xEB 0x017D # LATIN CAPITAL LETTER Z WITH CARON +0xEC 0x017E # LATIN SMALL LETTER Z WITH CARON +0xED 0x016A # LATIN CAPITAL LETTER U WITH MACRON +0xEE 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xEF 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xF0 0x016B # LATIN SMALL LETTER U WITH MACRON +0xF1 0x016E # LATIN CAPITAL LETTER U WITH RING ABOVE +0xF2 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xF3 0x016F # LATIN SMALL LETTER U WITH RING ABOVE +0xF4 0x0170 # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0xF5 0x0171 # LATIN SMALL LETTER U WITH DOUBLE ACUTE +0xF6 0x0172 # LATIN CAPITAL LETTER U WITH OGONEK +0xF7 0x0173 # LATIN SMALL LETTER U WITH OGONEK +0xF8 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE +0xF9 0x00FD # LATIN SMALL LETTER Y WITH ACUTE +0xFA 0x0137 # LATIN SMALL LETTER K WITH CEDILLA +0xFB 0x017B # LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xFC 0x0141 # LATIN CAPITAL LETTER L WITH STROKE +0xFD 0x017C # LATIN SMALL LETTER Z WITH DOT ABOVE +0xFE 0x0122 # LATIN CAPITAL LETTER G WITH CEDILLA +0xFF 0x02C7 # CARON diff --git a/charsets/mac-cyrillic.txt b/charsets/mac-cyrillic.txt new file mode 100644 index 0000000..22573a2 --- /dev/null +++ b/charsets/mac-cyrillic.txt @@ -0,0 +1,347 @@ +#======================================================================= +# FTP file name: CYRILLIC.TXT +# +# Contents: Map (external version) from Mac OS Cyrillic +# character set to Unicode 2.0 +# +# Copyright: (c) 1995-1998 by Apple Computer, Inc., all rights +# reserved. +# +# Contacts: Peter Edberg +# Julio Gonzalez +# John Jenkins +# +# Changes: +# +# n05 1998-Feb-05 Update header comments to new format; no +# mapping changes. Matches internal utom<3>, +# ufrm<13>, and Text Encoding Converter +# version 1.3. 
+# n03 1995-Apr-15 First version (after fixing some typos). +# Matches internal ufrm<5>. +# +# Standard header: +# ---------------- +# +# Apple, the Apple logo, and Macintosh are trademarks of Apple +# Computer, Inc., registered in the United States and other countries. +# Unicode is a trademark of Unicode Inc. For the sake of brevity, +# throughout this document, "Macintosh" can be used to refer to +# Macintosh computers and "Unicode" can be used to refer to the +# Unicode standard. +# +# Apple makes no warranty or representation, either express or +# implied, with respect to these tables, their quality, accuracy, or +# fitness for a particular purpose. In no event will Apple be liable +# for direct, indirect, special, incidental, or consequential damages +# resulting from any defect or inaccuracy in this document or the +# accompanying tables. +# +# These mapping tables and character lists are subject to change. +# The latest tables should be available from the following: +# +# +# +# +# For general information about Mac OS encodings and these mapping +# tables, see the file "README.TXT". +# +# Format: +# ------- +# +# Three tab-separated columns; +# '#' begins a comment which continues to the end of the line. +# Column #1 is the Mac OS Cyrillic code (in hex as 0xNN) +# Column #2 is the corresponding Unicode (in hex as 0xNNNN) +# Column #3 is a comment containing the Unicode name +# +# The entries are in Mac OS Cyrillic code order. +# +# Control character mappings are not shown in this table, following +# the conventions of the standard UTC mapping tables. However, the +# Mac OS Cyrillic character set uses the standard control characters +# at 0x00-0x1F and 0x7F. +# +# Notes on Mac OS Cyrillic: +# ------------------------- +# +# The Mac Cyrillic encoding is used for most Cyrillic localized +# system software except Ukrainian. A variant of Mac OS Cyrillic +# is used for Ukrainian and for the Cyrillic Language Kit; it is +# covered by a separate table. 
+# +# The Mac OS Cyrillic encoding has the same Cyrillic letter +# repertoire as ISO 8859-5 (although not at the same code points). +# This covers most of the Slavic languages written in Cyrillic +# script. +# +# The Mac OS Cyrillic encoding also includes a number of characters +# needed for the Mac OS user interface and localization (e.g. +# ellipsis, bullet, copyright sign). All of the characters in Mac OS +# Cyrillic that are also in the Mac OS Roman encoding are at the +# same code point in both; this improves application compatibility. +# +# Unicode mapping issues and notes: +# --------------------------------- +# +# Details of mapping changes in each version: +# ------------------------------------------- +# +################## + +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C 
# COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER 
H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +# +0x80 0x0410 # CYRILLIC CAPITAL LETTER A +0x81 0x0411 # CYRILLIC CAPITAL LETTER BE +0x82 0x0412 # CYRILLIC CAPITAL LETTER VE +0x83 0x0413 # CYRILLIC CAPITAL LETTER GHE +0x84 0x0414 # CYRILLIC CAPITAL LETTER DE +0x85 0x0415 # CYRILLIC CAPITAL LETTER IE +0x86 0x0416 # CYRILLIC CAPITAL LETTER ZHE +0x87 0x0417 # CYRILLIC CAPITAL LETTER ZE +0x88 0x0418 # CYRILLIC CAPITAL LETTER I +0x89 0x0419 # CYRILLIC CAPITAL LETTER SHORT I +0x8A 0x041A # CYRILLIC CAPITAL LETTER KA +0x8B 0x041B # CYRILLIC CAPITAL LETTER EL +0x8C 0x041C # CYRILLIC CAPITAL LETTER EM +0x8D 0x041D # CYRILLIC CAPITAL LETTER EN +0x8E 0x041E # CYRILLIC CAPITAL LETTER O +0x8F 0x041F # CYRILLIC CAPITAL LETTER PE +0x90 0x0420 # CYRILLIC CAPITAL LETTER ER +0x91 0x0421 # CYRILLIC CAPITAL LETTER ES +0x92 0x0422 # CYRILLIC CAPITAL LETTER TE +0x93 0x0423 # CYRILLIC CAPITAL LETTER U +0x94 0x0424 # CYRILLIC CAPITAL LETTER EF +0x95 0x0425 # CYRILLIC CAPITAL LETTER HA +0x96 0x0426 # CYRILLIC CAPITAL LETTER TSE +0x97 0x0427 # CYRILLIC CAPITAL LETTER CHE +0x98 0x0428 # CYRILLIC CAPITAL LETTER SHA +0x99 0x0429 # CYRILLIC CAPITAL LETTER SHCHA +0x9A 0x042A # CYRILLIC CAPITAL LETTER HARD SIGN +0x9B 0x042B # CYRILLIC CAPITAL LETTER YERU +0x9C 0x042C # CYRILLIC CAPITAL 
LETTER SOFT SIGN +0x9D 0x042D # CYRILLIC CAPITAL LETTER E +0x9E 0x042E # CYRILLIC CAPITAL LETTER YU +0x9F 0x042F # CYRILLIC CAPITAL LETTER YA +0xA0 0x2020 # DAGGER +0xA1 0x00B0 # DEGREE SIGN +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A7 # SECTION SIGN +0xA5 0x2022 # BULLET +0xA6 0x00B6 # PILCROW SIGN +0xA7 0x0406 # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0xA8 0x00AE # REGISTERED SIGN +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x2122 # TRADE MARK SIGN +0xAB 0x0402 # CYRILLIC CAPITAL LETTER DJE +0xAC 0x0452 # CYRILLIC SMALL LETTER DJE +0xAD 0x2260 # NOT EQUAL TO +0xAE 0x0403 # CYRILLIC CAPITAL LETTER GJE +0xAF 0x0453 # CYRILLIC SMALL LETTER GJE +0xB0 0x221E # INFINITY +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x2264 # LESS-THAN OR EQUAL TO +0xB3 0x2265 # GREATER-THAN OR EQUAL TO +0xB4 0x0456 # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +0xB5 0x00B5 # MICRO SIGN +0xB6 0x2202 # PARTIAL DIFFERENTIAL +0xB7 0x0408 # CYRILLIC CAPITAL LETTER JE +0xB8 0x0404 # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0xB9 0x0454 # CYRILLIC SMALL LETTER UKRAINIAN IE +0xBA 0x0407 # CYRILLIC CAPITAL LETTER YI +0xBB 0x0457 # CYRILLIC SMALL LETTER YI +0xBC 0x0409 # CYRILLIC CAPITAL LETTER LJE +0xBD 0x0459 # CYRILLIC SMALL LETTER LJE +0xBE 0x040A # CYRILLIC CAPITAL LETTER NJE +0xBF 0x045A # CYRILLIC SMALL LETTER NJE +0xC0 0x0458 # CYRILLIC SMALL LETTER JE +0xC1 0x0405 # CYRILLIC CAPITAL LETTER DZE +0xC2 0x00AC # NOT SIGN +0xC3 0x221A # SQUARE ROOT +0xC4 0x0192 # LATIN SMALL LETTER F WITH HOOK +0xC5 0x2248 # ALMOST EQUAL TO +0xC6 0x2206 # INCREMENT +0xC7 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC8 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC9 0x2026 # HORIZONTAL ELLIPSIS +0xCA 0x00A0 # NO-BREAK SPACE +0xCB 0x040B # CYRILLIC CAPITAL LETTER TSHE +0xCC 0x045B # CYRILLIC SMALL LETTER TSHE +0xCD 0x040C # CYRILLIC CAPITAL LETTER KJE +0xCE 0x045C # CYRILLIC SMALL LETTER KJE +0xCF 0x0455 # CYRILLIC SMALL LETTER DZE +0xD0 0x2013 # EN DASH +0xD1 0x2014 # EM DASH 
+0xD2 0x201C # LEFT DOUBLE QUOTATION MARK +0xD3 0x201D # RIGHT DOUBLE QUOTATION MARK +0xD4 0x2018 # LEFT SINGLE QUOTATION MARK +0xD5 0x2019 # RIGHT SINGLE QUOTATION MARK +0xD6 0x00F7 # DIVISION SIGN +0xD7 0x201E # DOUBLE LOW-9 QUOTATION MARK +0xD8 0x040E # CYRILLIC CAPITAL LETTER SHORT U +0xD9 0x045E # CYRILLIC SMALL LETTER SHORT U +0xDA 0x040F # CYRILLIC CAPITAL LETTER DZHE +0xDB 0x045F # CYRILLIC SMALL LETTER DZHE +0xDC 0x2116 # NUMERO SIGN +0xDD 0x0401 # CYRILLIC CAPITAL LETTER IO +0xDE 0x0451 # CYRILLIC SMALL LETTER IO +0xDF 0x044F # CYRILLIC SMALL LETTER YA +0xE0 0x0430 # CYRILLIC SMALL LETTER A +0xE1 0x0431 # CYRILLIC SMALL LETTER BE +0xE2 0x0432 # CYRILLIC SMALL LETTER VE +0xE3 0x0433 # CYRILLIC SMALL LETTER GHE +0xE4 0x0434 # CYRILLIC SMALL LETTER DE +0xE5 0x0435 # CYRILLIC SMALL LETTER IE +0xE6 0x0436 # CYRILLIC SMALL LETTER ZHE +0xE7 0x0437 # CYRILLIC SMALL LETTER ZE +0xE8 0x0438 # CYRILLIC SMALL LETTER I +0xE9 0x0439 # CYRILLIC SMALL LETTER SHORT I +0xEA 0x043A # CYRILLIC SMALL LETTER KA +0xEB 0x043B # CYRILLIC SMALL LETTER EL +0xEC 0x043C # CYRILLIC SMALL LETTER EM +0xED 0x043D # CYRILLIC SMALL LETTER EN +0xEE 0x043E # CYRILLIC SMALL LETTER O +0xEF 0x043F # CYRILLIC SMALL LETTER PE +0xF0 0x0440 # CYRILLIC SMALL LETTER ER +0xF1 0x0441 # CYRILLIC SMALL LETTER ES +0xF2 0x0442 # CYRILLIC SMALL LETTER TE +0xF3 0x0443 # CYRILLIC SMALL LETTER U +0xF4 0x0444 # CYRILLIC SMALL LETTER EF +0xF5 0x0445 # CYRILLIC SMALL LETTER HA +0xF6 0x0446 # CYRILLIC SMALL LETTER TSE +0xF7 0x0447 # CYRILLIC SMALL LETTER CHE +0xF8 0x0448 # CYRILLIC SMALL LETTER SHA +0xF9 0x0449 # CYRILLIC SMALL LETTER SHCHA +0xFA 0x044A # CYRILLIC SMALL LETTER HARD SIGN +0xFB 0x044B # CYRILLIC SMALL LETTER YERU +0xFC 0x044C # CYRILLIC SMALL LETTER SOFT SIGN +0xFD 0x044D # CYRILLIC SMALL LETTER E +0xFE 0x044E # CYRILLIC SMALL LETTER YU +0xFF 0x00A4 # CURRENCY SIGN diff --git a/charsets/mac-greek.txt b/charsets/mac-greek.txt new file mode 100644 index 0000000..f3374f1 --- /dev/null +++ 
b/charsets/mac-greek.txt @@ -0,0 +1,355 @@ +#======================================================================= +# File name: GREEK.TXT +# +# Contents: Map (external version) from Mac OS Greek +# character set to Unicode 2.1 and later. +# +# Copyright: (c) 1995-2002, 2005 by Apple Computer, Inc., all rights +# reserved. +# +# Contact: charsets@apple.com +# +# Changes: +# +# c02 2005-Apr-05 Update header comments. Matches internal xml +# and Text Encoding Converter 2.0. +# b3,c1 2002-Dec-19 Update to match changes in Mac OS Greek +# encoding for Mac OS 9.2.2 and later. +# Update URLs, notes. Matches internal +# utom. +# b02 1999-Sep-22 Update contact e-mail address. Matches +# internal utom, ufrm, and Text +# Encoding Converter version 1.5. +# n06 1998-Feb-05 Update to match internal utom, ufrm, +# and Text Encoding Converter versions 1.3: +# Change mapping for 0xAF from U+0387 to its +# canonical decomposition, U+00B7. Also +# update header comments to new format. +# n04 1995-Apr-15 First version (after fixing some typos). +# Matches internal ufrm. +# +# Standard header: +# ---------------- +# +# Apple, the Apple logo, and Macintosh are trademarks of Apple +# Computer, Inc., registered in the United States and other countries. +# Unicode is a trademark of Unicode Inc. For the sake of brevity, +# throughout this document, "Macintosh" can be used to refer to +# Macintosh computers and "Unicode" can be used to refer to the +# Unicode standard. +# +# Apple Computer, Inc. ("Apple") makes no warranty or representation, +# either express or implied, with respect to this document and the +# included data, its quality, accuracy, or fitness for a particular +# purpose. In no event will Apple be liable for direct, indirect, +# special, incidental, or consequential damages resulting from any +# defect or inaccuracy in this document or the included data. +# +# These mapping tables and character lists are subject to change. 
+# The latest tables should be available from the following: +# +# +# +# For general information about Mac OS encodings and these mapping +# tables, see the file "README.TXT". +# +# Format: +# ------- +# +# Three tab-separated columns; +# '#' begins a comment which continues to the end of the line. +# Column #1 is the Mac OS Greek code (in hex as 0xNN) +# Column #2 is the corresponding Unicode (in hex as 0xNNNN) +# Column #3 is a comment containing the Unicode name +# +# The entries are in Mac OS Greek code order. +# +# One of these mappings requires the use of a corporate character. +# See the file "CORPCHAR.TXT" and notes below. +# +# Control character mappings are not shown in this table, following +# the conventions of the standard UTC mapping tables. However, the +# Mac OS Greek character set uses the standard control characters at +# 0x00-0x1F and 0x7F. +# +# Notes on Mac OS Greek: +# ---------------------- +# +# This is a legacy Mac OS encoding; in the Mac OS X Carbon and Cocoa +# environments, it is only supported via transcoding to and from +# Unicode. +# +# Although a Mac OS script code is defined for Greek (smGreek = 6), +# the Greek localized system does not currently use it (the font +# family IDs are in the Mac OS Roman range). To determine if the +# Greek encoding is being used when the script code is smRoman (0), +# you must check if the system region code is 20, verGreece. +# +# The Mac OS Greek encoding is a superset of the repertoire of +# ISO 8859-7 (although characters are not at the same code points), +# except that LEFT & RIGHT SINGLE QUOTATION MARK replace the +# MODIFIER LETTER REVERSED COMMA & APOSTROPHE (spacing versions of +# Greek rough & smooth breathing marks) that are in ISO 8859-7. +# The added characters in Mac OS Greek include more punctuation and +# symbols and several accented Latin letters. +# +# Before Mac OS 9.2.2, code point 0x9C was SOFT HYPHEN (U+00AD), and +# code point 0xFF was undefined. 
In Mac OS 9.2.2 and later versions, +# SOFT HYPHEN was moved to 0xFF, and code point 0x9C was changed to be +# EURO SIGN (U+20AC); the standard Apple fonts are updated for Mac OS +# 9.2.2 to reflect this. There is a "no Euro sign" variant of the Mac +# OS Greek encoding that uses the older mapping; this can be used for +# older fonts. +# +# This "no Euro sign" variant of Mac OS Greek was the character set +# used by Mac OS Greek systems before 9.2.2 except for system 6.0.7, +# which used a variant character set but was quickly replaced with +# Greek system 6.0.7.1 using the "no Euro sign" character set +# documented here. Greek system 4.1 used a variant Greek set that had +# ISO 8859-7 in 0xA0-0xFF (with some holes filled in with DTP +# characters), and Mac OS Roman accented Roman letters in 0x80-0x9F. +# +# Unicode mapping issues and notes: +# --------------------------------- +# +# Details of mapping changes in each version: +# ------------------------------------------- +# +# Changes from version b02 to version b03/c01: +# +# - The Mac OS Greek encoding changed for Mac OS 9.2.2 and later +# as follows: +# 0x9C, changed from 0x00AD SOFT HYPHEN to 0x20AC EURO SIGN +# 0xFF, changed from undefined to 0x00AD SOFT HYPHEN +# +# Changes from version n04 to version n06: +# +# - Change mapping of 0xAF from U+0387 to its canonical +# decomposition, U+00B7. 
+# +################## + +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET 
+0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +# +0x80 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0x81 0x00B9 # SUPERSCRIPT ONE +0x82 0x00B2 # SUPERSCRIPT TWO +0x83 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0x84 0x00B3 # SUPERSCRIPT THREE +0x85 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0x86 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0x87 0x0385 # GREEK DIALYTIKA TONOS +0x88 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0x89 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0x8A 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0x8B 0x0384 # GREEK TONOS +0x8C 0x00A8 # DIAERESIS +0x8D 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0x8E 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0x8F 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0x90 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0x91 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0x92 0x00A3 # POUND SIGN +0x93 0x2122 # TRADE MARK SIGN +0x94 
0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0x95 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0x96 0x2022 # BULLET +0x97 0x00BD # VULGAR FRACTION ONE HALF +0x98 0x2030 # PER MILLE SIGN +0x99 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0x9A 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0x9B 0x00A6 # BROKEN BAR +0x9C 0x20AC # EURO SIGN # before Mac OS 9.2.2, was SOFT HYPHEN +0x9D 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0x9E 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0x9F 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xA0 0x2020 # DAGGER +0xA1 0x0393 # GREEK CAPITAL LETTER GAMMA +0xA2 0x0394 # GREEK CAPITAL LETTER DELTA +0xA3 0x0398 # GREEK CAPITAL LETTER THETA +0xA4 0x039B # GREEK CAPITAL LETTER LAMDA +0xA5 0x039E # GREEK CAPITAL LETTER XI +0xA6 0x03A0 # GREEK CAPITAL LETTER PI +0xA7 0x00DF # LATIN SMALL LETTER SHARP S +0xA8 0x00AE # REGISTERED SIGN +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x03A3 # GREEK CAPITAL LETTER SIGMA +0xAB 0x03AA # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +0xAC 0x00A7 # SECTION SIGN +0xAD 0x2260 # NOT EQUAL TO +0xAE 0x00B0 # DEGREE SIGN +0xAF 0x00B7 # MIDDLE DOT +0xB0 0x0391 # GREEK CAPITAL LETTER ALPHA +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x2264 # LESS-THAN OR EQUAL TO +0xB3 0x2265 # GREATER-THAN OR EQUAL TO +0xB4 0x00A5 # YEN SIGN +0xB5 0x0392 # GREEK CAPITAL LETTER BETA +0xB6 0x0395 # GREEK CAPITAL LETTER EPSILON +0xB7 0x0396 # GREEK CAPITAL LETTER ZETA +0xB8 0x0397 # GREEK CAPITAL LETTER ETA +0xB9 0x0399 # GREEK CAPITAL LETTER IOTA +0xBA 0x039A # GREEK CAPITAL LETTER KAPPA +0xBB 0x039C # GREEK CAPITAL LETTER MU +0xBC 0x03A6 # GREEK CAPITAL LETTER PHI +0xBD 0x03AB # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +0xBE 0x03A8 # GREEK CAPITAL LETTER PSI +0xBF 0x03A9 # GREEK CAPITAL LETTER OMEGA +0xC0 0x03AC # GREEK SMALL LETTER ALPHA WITH TONOS +0xC1 0x039D # GREEK CAPITAL LETTER NU +0xC2 0x00AC # NOT SIGN +0xC3 0x039F # GREEK CAPITAL LETTER OMICRON +0xC4 0x03A1 # GREEK CAPITAL LETTER RHO +0xC5 0x2248 # ALMOST EQUAL TO +0xC6 0x03A4 # 
GREEK CAPITAL LETTER TAU +0xC7 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC8 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC9 0x2026 # HORIZONTAL ELLIPSIS +0xCA 0x00A0 # NO-BREAK SPACE +0xCB 0x03A5 # GREEK CAPITAL LETTER UPSILON +0xCC 0x03A7 # GREEK CAPITAL LETTER CHI +0xCD 0x0386 # GREEK CAPITAL LETTER ALPHA WITH TONOS +0xCE 0x0388 # GREEK CAPITAL LETTER EPSILON WITH TONOS +0xCF 0x0153 # LATIN SMALL LIGATURE OE +0xD0 0x2013 # EN DASH +0xD1 0x2015 # HORIZONTAL BAR +0xD2 0x201C # LEFT DOUBLE QUOTATION MARK +0xD3 0x201D # RIGHT DOUBLE QUOTATION MARK +0xD4 0x2018 # LEFT SINGLE QUOTATION MARK +0xD5 0x2019 # RIGHT SINGLE QUOTATION MARK +0xD6 0x00F7 # DIVISION SIGN +0xD7 0x0389 # GREEK CAPITAL LETTER ETA WITH TONOS +0xD8 0x038A # GREEK CAPITAL LETTER IOTA WITH TONOS +0xD9 0x038C # GREEK CAPITAL LETTER OMICRON WITH TONOS +0xDA 0x038E # GREEK CAPITAL LETTER UPSILON WITH TONOS +0xDB 0x03AD # GREEK SMALL LETTER EPSILON WITH TONOS +0xDC 0x03AE # GREEK SMALL LETTER ETA WITH TONOS +0xDD 0x03AF # GREEK SMALL LETTER IOTA WITH TONOS +0xDE 0x03CC # GREEK SMALL LETTER OMICRON WITH TONOS +0xDF 0x038F # GREEK CAPITAL LETTER OMEGA WITH TONOS +0xE0 0x03CD # GREEK SMALL LETTER UPSILON WITH TONOS +0xE1 0x03B1 # GREEK SMALL LETTER ALPHA +0xE2 0x03B2 # GREEK SMALL LETTER BETA +0xE3 0x03C8 # GREEK SMALL LETTER PSI +0xE4 0x03B4 # GREEK SMALL LETTER DELTA +0xE5 0x03B5 # GREEK SMALL LETTER EPSILON +0xE6 0x03C6 # GREEK SMALL LETTER PHI +0xE7 0x03B3 # GREEK SMALL LETTER GAMMA +0xE8 0x03B7 # GREEK SMALL LETTER ETA +0xE9 0x03B9 # GREEK SMALL LETTER IOTA +0xEA 0x03BE # GREEK SMALL LETTER XI +0xEB 0x03BA # GREEK SMALL LETTER KAPPA +0xEC 0x03BB # GREEK SMALL LETTER LAMDA +0xED 0x03BC # GREEK SMALL LETTER MU +0xEE 0x03BD # GREEK SMALL LETTER NU +0xEF 0x03BF # GREEK SMALL LETTER OMICRON +0xF0 0x03C0 # GREEK SMALL LETTER PI +0xF1 0x03CE # GREEK SMALL LETTER OMEGA WITH TONOS +0xF2 0x03C1 # GREEK SMALL LETTER RHO +0xF3 0x03C3 # GREEK SMALL LETTER SIGMA +0xF4 0x03C4 # GREEK SMALL LETTER 
TAU +0xF5 0x03B8 # GREEK SMALL LETTER THETA +0xF6 0x03C9 # GREEK SMALL LETTER OMEGA +0xF7 0x03C2 # GREEK SMALL LETTER FINAL SIGMA +0xF8 0x03C7 # GREEK SMALL LETTER CHI +0xF9 0x03C5 # GREEK SMALL LETTER UPSILON +0xFA 0x03B6 # GREEK SMALL LETTER ZETA +0xFB 0x03CA # GREEK SMALL LETTER IOTA WITH DIALYTIKA +0xFC 0x03CB # GREEK SMALL LETTER UPSILON WITH DIALYTIKA +0xFD 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0xFE 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +0xFF 0x00AD # SOFT HYPHEN # before Mac OS 9.2.2, was undefined diff --git a/charsets/mac-hebrew.txt b/charsets/mac-hebrew.txt new file mode 100644 index 0000000..09b0bd3 --- /dev/null +++ b/charsets/mac-hebrew.txt @@ -0,0 +1,601 @@ +#======================================================================= +# File name: HEBREW.TXT +# +# Contents: Map (external version) from Mac OS Hebrew +# character set to Unicode 2.1 and later. +# +# Copyright: (c) 1995-2002, 2005 by Apple Computer, Inc., all rights +# reserved. +# +# Contact: charsets@apple.com +# +# Changes: +# +# c02 2005-Apr-05 Update header comments; add section on +# roundtrip considerations. Matches internal +# xml and Text Encoding Converter 2.0. +# b3,c1 2002-Dec-19 Don't require left-right context for digits +# 0x30-0x39. Change mapping of 0x81 to use +# decomposition. Reverse the mappings of 0xA8, +# 0xA9. Update URLs, notes. Matches internal +# utom. +# b02 1999-Sep-22 Update contact e-mail address. Matches +# internal utom, ufrm, and Text +# Encoding Converter version 1.5. +# n03 1998-Feb-05 Show required Unicode character +# directionality in a different way. Update +# mappings for 0xC0 and 0xDE to use +# transcoding hints; matches internal utom, +# ufrm, and Text Encoding Converter +# version 1.3. Rewrite header comments. +# n01 1995-Nov-15 First version. Matches internal ufrm. 
+# +# Standard header: +# ---------------- +# +# Apple, the Apple logo, and Macintosh are trademarks of Apple +# Computer, Inc., registered in the United States and other countries. +# Unicode is a trademark of Unicode Inc. For the sake of brevity, +# throughout this document, "Macintosh" can be used to refer to +# Macintosh computers and "Unicode" can be used to refer to the +# Unicode standard. +# +# Apple Computer, Inc. ("Apple") makes no warranty or representation, +# either express or implied, with respect to this document and the +# included data, its quality, accuracy, or fitness for a particular +# purpose. In no event will Apple be liable for direct, indirect, +# special, incidental, or consequential damages resulting from any +# defect or inaccuracy in this document or the included data. +# +# These mapping tables and character lists are subject to change. +# The latest tables should be available from the following: +# +# +# +# For general information about Mac OS encodings and these mapping +# tables, see the file "README.TXT". +# +# Format: +# ------- +# +# Three tab-separated columns; +# '#' begins a comment which continues to the end of the line. +# Column #1 is the Mac OS Hebrew code (in hex as 0xNN). +# Column #2 is the corresponding Unicode or Unicode sequence (in +# hex as 0xNNNN, 0xNNNN+0xNNNN, etc.). Sequences of up to 3 +# Unicode characters are used here. A single Unicode character +# may be preceded by a tag indicating required directionality +# (i.e. <LR>+0xNNNN or <RL>+0xNNNN). +# Column #3 is a comment containing the Unicode name. +# +# The entries are in Mac OS Hebrew code order. +# +# Some of these mappings require the use of corporate characters. +# See the file "CORPCHAR.TXT" and notes below. +# +# Control character mappings are not shown in this table, following +# the conventions of the standard UTC mapping tables. However, the +# Mac OS Hebrew character set uses the standard control characters at +# 0x00-0x1F and 0x7F. 
+# +# Notes on Mac OS Hebrew: +# ----------------------- +# +# This is a legacy Mac OS encoding; in the Mac OS X Carbon and Cocoa +# environments, it is only supported via transcoding to and from +# Unicode. +# +# 1. General +# +# The Mac OS Hebrew character set supports the Hebrew and Yiddish +# languages. It incorporates the Hebrew letter repertoire of +# ISO 8859-8, and uses the same code points for them, 0xE0-0xFA. +# It also incorporates the ASCII character set. In addition, the +# Mac OS Hebrew character set includes the following: +# +# - Hebrew points (nikud marks) at 0xC6, 0xCB-0xCF and 0xD8-0xDF. +# These are non-spacing combining marks. Note that the RAFE point +# at 0xD8 is not displayed correctly in some fonts, and cannot be +# typed using the keyboard layouts in the current Hebrew localized +# systems. Also note: The character given in Unicode as QAMATS +# (U+05B8) actually refers to two different sounds, depending on +# context. For example, when ALEF is followed by QAMATS, the QAMATS +# can actually refer to two different sounds depending on the +# following letters. The Mac OS Hebrew character set separately +# encodes these two sounds for the same graphic shape, as "qamats" +# (0xCB) and "qamats qatan" (0xDE). The "qamats" character is more +# common, so it is mapped to the Unicode QAMATS; "qamats qatan" can +# only be used with a limited number of characters, and it is +# mapped using a corporate-zone variant tag (see below). +# +# - Various Hebrew ligatures at 0x81, 0xC0, 0xC7, 0xC8, 0xD6, and +# 0xD7. Also note that the Yiddish YOD YOD PATAH ligature at 0x81 +# is missing in some fonts. +# +# - The NEW SHEQEL SIGN at 0xA6. +# +# - Latin characters with diacritics at 0x80 and 0x82-0x9F. However, +# most of these cannot be typed using the keyboard layouts in the +# Hebrew localized systems. +# +# - Right-left versions of certain ASCII punctuation, symbols and +# digits: 0xA0-0xA5, 0xA7-0xBF, 0xFB-0xFF. See below. 
+# +# - Miscellaneous additional punctuation at 0xC1, 0xC9, 0xCA, and +# 0xD0-0xD5. There is a variant of the Hebrew encoding in which +# the LEFT SINGLE QUOTATION MARK at 0xD4 is replaced by FIGURE +# SPACE. The glyphs for some of the other punctuation characters +# are missing in some fonts. +# +# - Four obsolete characters at 0xC2-0xC5 known as canorals (not to +# be confused with cantillation marks!). These were used for +# manual positioning of nikud marks before System 7.1 (at which +# point nikud positioning became automatic with WorldScript.). +# +# 2. Directional characters and roundtrip fidelity +# +# The Mac OS Hebrew character set was developed around 1987. At that +# time the bidirectional line layout algorithm used in the Mac OS +# Hebrew system was fairly simple; it used only a few direction +# classes (instead of the 19 now used in the Unicode bidirectional +# algorithm). In order to permit users to handle some tricky layout +# problems, certain punctuation, symbol, and digit characters have +# duplicate code points, one with a left-right direction attribute and +# the other with a right-left direction attribute. +# +# For example, plus sign is encoded at 0x2B with a left-right +# attribute, and at 0xAB with a right-left attribute. However, there +# is only one PLUS SIGN character in Unicode. This leads to some +# interesting problems when mapping between Mac OS Hebrew and Unicode; +# see below. +# +# A related problem is that even when a particular character is +# encoded only once in Mac OS Hebrew, it may have a different +# direction attribute than the corresponding Unicode character. +# +# For example, the Mac OS Hebrew character at 0xC9 is HORIZONTAL +# ELLIPSIS with strong right-left direction. However, the Unicode +# character HORIZONTAL ELLIPSIS has direction class neutral. +# +# 3. Font variants +# +# The table in this file gives the Unicode mappings for the standard +# Mac OS Hebrew encoding. 
This encoding is supported by many of the +# Apple fonts (including all of the fonts in the Hebrew Language Kit), +# and is the encoding supported by the text processing utilities. +# However, some TrueType fonts provided with the localized Hebrew +# system implement a slightly different encoding; the difference is +# only in one code point, 0xD4. For the standard variant, this is: +# 0xD4 -> <RL>+0x2018 LEFT SINGLE QUOTATION MARK, right-left +# +# The TrueType variant is used by the following TrueType fonts from +# the localized system: Caesarea, Carmel Book, Gilboa, Ramat Sharon, +# and Sinai Book. For these, 0xD4 is as follows: +# 0xD4 -> <RL>+0x2007 FIGURE SPACE, right-left +# +# Unicode mapping issues and notes: +# --------------------------------- +# +# 1. Matching the direction of Mac OS Hebrew characters +# +# When Mac OS Hebrew encodes a character twice but with different +# direction attributes for the two code points - as in the case of +# plus sign mentioned above - we need a way to map both Mac OS Hebrew +# code points to Unicode and back again without loss of information. +# With the plus sign, for example, mapping one of the Mac OS Hebrew +# characters to a code in the Unicode corporate use zone is +# undesirable, since both of the plus sign characters are likely to +# be used in text that is interchanged. +# +# The problem is solved with the use of direction override characters +# and direction-dependent mappings. When mapping from Mac OS Hebrew +# to Unicode, we use direction overrides as necessary to force the +# direction of the resulting Unicode characters. +# +# The required direction is indicated by a direction tag in the +# mappings. A tag of <LR> means the corresponding Unicode character +# must have a strong left-right context, and a tag of <RL> indicates +# a right-left context. +# +# For example, the mapping of 0x2B is given as <LR>+0x002B; the +# mapping of 0xAB is given as <RL>+0x002B. 
If we map an isolated +# instance of 0x2B to Unicode, it should be mapped as follows (LRO +# indicates LEFT-RIGHT OVERRIDE, PDF indicates POP DIRECTION +# FORMATTING): +# +# 0x2B -> 0x202D (LRO) + 0x002B (PLUS SIGN) + 0x202C (PDF) +# +# When mapping several characters in a row that require direction +# forcing, the overrides need only be used at the beginning and end. +# For example: +# +# 0x24 0x20 0x28 0x29 -> 0x202D 0x0024 0x0020 0x0028 0x0029 0x202C +# +# If neutral characters that require direction forcing are already +# between strong-direction characters with matching directionality, +# then direction overrides need not be used. Direction overrides are +# always needed to map the right-left digits at 0xB0-0xB9. +# +# When mapping from Unicode to Mac OS Hebrew, the Unicode +# bidirectional algorithm should be used to determine resolved +# direction of the Unicode characters. The mapping from Unicode to +# Mac OS Hebrew can then be disambiguated by the use of the resolved +# direction: +# +# Unicode 0x002B -> Mac OS Hebrew 0x2B (if L) or 0xAB (if R) +# +# However, this also means the direction override characters should +# be discarded when mapping from Unicode to Mac OS Hebrew (after +# they have been used to determine resolved direction), since the +# direction override information is carried by the code point itself. +# +# Even when direction overrides are not needed for roundtrip +# fidelity, they are sometimes used when mapping Mac OS Hebrew +# characters to Unicode in order to achieve similar text layout with +# the resulting Unicode text. For example, the single Mac OS Hebrew +# ellipsis character has direction class right-left, and there is no +# left-right version. However, the Unicode HORIZONTAL ELLIPSIS +# character has direction class neutral (which means it may end up +# with a resolved direction of left-right if surrounded by left-right +# characters). 
When mapping the Mac OS Hebrew ellipsis to Unicode, it +# is surrounded with a direction override to help preserve proper +# text layout. The resolved direction is not needed or used when +# mapping the Unicode HORIZONTAL ELLIPSIS back to Mac OS Hebrew. +# +# 2. Use of corporate-zone Unicodes +# +# The goals in the mappings provided here are: +# - Ensure roundtrip mapping from every character in the Mac OS +# Hebrew character set to Unicode and back +# - Use standard Unicode characters as much as possible, to +# maximize interchangeability of the resulting Unicode text. +# Whenever possible, avoid having content carried by private-use +# characters. +# +# Some of the characters in the Mac OS Hebrew character set do not +# correspond to distinct, single Unicode characters. To map these +# and satisfy both goals above, we employ various strategies. +# +# a) If possible, use private use characters in combination with +# standard Unicode characters to mark variants of the standard +# Unicode character. +# +# Apple has defined a block of 32 corporate characters as "transcoding +# hints." These are used in combination with standard Unicode characters +# to force them to be treated in a special way for mapping to other +# encodings; they have no other effect. Sixteen of these transcoding +# hints are "grouping hints" - they indicate that the next 2-4 Unicode +# characters should be treated as a single entity for transcoding. The +# other sixteen transcoding hints are "variant tags" - they are like +# combining characters, and can follow a standard Unicode (or a sequence +# consisting of a base character and other combining characters) to +# cause it to be treated in a special way for transcoding. These always +# terminate a combining-character sequence. 
+# +# Two transcoding hints are used in this mapping table: a grouping hint +# and a variant tag: +# hint: +# 0xF86A group next 2 characters, right-left directionality +# 0xF87F variant tag +# +# In Mac OS Hebrew, 0xC0 is a ligature for lamed holam. This can also +# be represented in Mac OS Hebrew as 0xEC+0xDD, using separate +# characters for lamed and holam. The latter sequence is mapped to +# Unicode as 0x05DC+0x05B9, i.e. as the sequence HEBREW LETTER LAMED + +# HEBREW POINT HOLAM. We want to map the ligature 0xC0 using the same +# standard Unicode characters, but for round-trip fidelity we need to +# distinguish it from the mapping of the sequence 0xEC+0xDD. Thus for +# 0xC0 we use a grouping hint, and map as follows: +# +# 0xC0 -> 0xF86A+0x05DC+0x05B9 +# +# The variant tag is used for "qamats qatan" to mark it as an alternate +# for HEBREW POINT QAMATS, as follows: +# +# 0xDE -> 0x05B8+0xF87F +# +# b) Otherwise, use private use characters by themselves to map Mac OS +# Hebrew characters which have no relationship to any standard Unicode +# character. +# +# The following additional corporate zone Unicode characters are used +# for this purpose here (to map the obsolete "canorals", see above): +# +# 0xF89B Hebrew canoral 1 +# 0xF89C Hebrew canoral 2 +# 0xF89D Hebrew canoral 3 +# 0xF89E Hebrew canoral 4 +# +# 3. Roundtrip considerations when mapping to decomposed Unicode +# +# Both Mac OS Hebrew and Unicode provide multiple ways of representing +# certain letter-and-point combinations. For example, HEBREW LETTER +# VAV WITH HOLAM can be represented in Unicode as the single character +# 0xFB4B or as the sequence 0x05D5 0x05B9; similarly, it can be +# represented in Mac OS Hebrew as 0xC7 or as the sequence 0xE5 0xDD. +# This leads to some roundtrip problems. First note that we have the +# following mappings without such problems: +# +# Mac standard decomp. of reverse map +# OS Unicode mapping std. mapping of decomp. 
+# ---- ---------------------------------- ------------- ----------- +# 0xC6 0x05BC ... POINT DAGESH OR MAPIQ 0x05BC (same) 0xC6 +# 0xE5 0x05D5 ... LETTER VAV 0x05D5 (same) 0xE5 +# 0xDD 0x05B9 ... POINT HOLAM 0x05B9 (same) 0xDD +# +# However, those mappings above cause roundtrip problems for the +# following mappings if they are decomposed: +# +# Mac standard decomp. of reverse map +# OS Unicode mapping std. mapping of decomp. +# ---- ---------------------------------- ------------- ----------- +# 0xC7 0xFB4B ... LETTER VAV WITH HOLAM 0x05D5 0x05B9 0xE5 0xDD +# 0xC8 0xFB35 ... LETTER VAV WITH DAGESH 0x05D5 0x05BC 0xE5 0xC6 +# +# One solution is to use a grouping transcoding hint with the two +# decompositions above to mark the decomposed sequence for special +# treatment in transcoding. This yields the following mappings to +# decomposed Unicode: +# +# Mac decomposed +# OS Unicode mapping +# ---- -------------------- +# 0xC7 0xF86A 0x05D5 0x05B9 +# 0xC8 0xF86A 0x05D5 0x05BC +# +# Details of mapping changes in each version: +# ------------------------------------------- +# +# Changes from version b02 to version b03/c01: +# +# - Stop specifying left-right context for digits 0x30-0x39, since the +# corresponding Unicodes 0x0030-0x0039 already have left-right +# directionality. +# +# - Change mapping of 0x81 from 0xFB1F HEBREW LIGATURE YIDDISH YOD YOD +# PATAH to its canonical decomposition 0x05F2+0x05B7 to improve +# cross-platform compatibility (Windows doesn't handle 0xFB1F) +# +# - Interchange the mappings of 0xA8 and 0xA9 to obtain the correct +# open/close behavior; they work differently than in Mac Arabic.
+# The old mapping was +# 0xA8 +0x0028 # LEFT PARENTHESIS, right-left +# 0xA9 +0x0029 # RIGHT PARENTHESIS, right-left +# and the new mapping is +# 0xA8 +0x0029 # RIGHT PARENTHESIS, right-left +# 0xA9 +0x0028 # LEFT PARENTHESIS, right-left +# +# Changes from version n01 to version n03: +# +# - Change mapping for 0xC0 from single corporate character to +# grouping hint plus standard Unicodes +# +# - Change mapping for 0xDE from single corporate character to +# standard Unicode plus variant tag +# +################## + +0x20 +0x0020 # SPACE, left-right +0x21 +0x0021 # EXCLAMATION MARK, left-right +0x22 +0x0022 # QUOTATION MARK, left-right +0x23 +0x0023 # NUMBER SIGN, left-right +0x24 +0x0024 # DOLLAR SIGN, left-right +0x25 +0x0025 # PERCENT SIGN, left-right +0x26 0x0026 # AMPERSAND +0x27 +0x0027 # APOSTROPHE, left-right +0x28 +0x0028 # LEFT PARENTHESIS, left-right +0x29 +0x0029 # RIGHT PARENTHESIS, left-right +0x2A +0x002A # ASTERISK, left-right +0x2B +0x002B # PLUS SIGN, left-right +0x2C +0x002C # COMMA, left-right +0x2D +0x002D # HYPHEN-MINUS, left-right +0x2E +0x002E # FULL STOP, left-right +0x2F +0x002F # SOLIDUS, left-right +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A +0x003A # COLON, left-right +0x3B +0x003B # SEMICOLON, left-right +0x3C +0x003C # LESS-THAN SIGN, left-right +0x3D +0x003D # EQUALS SIGN, left-right +0x3E +0x003E # GREATER-THAN SIGN, left-right +0x3F +0x003F # QUESTION MARK, left-right +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # 
LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B +0x005B # LEFT SQUARE BRACKET, left-right +0x5C 0x005C # REVERSE SOLIDUS +0x5D +0x005D # RIGHT SQUARE BRACKET, left-right +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B +0x007B # LEFT CURLY BRACKET, left-right +0x7C +0x007C # VERTICAL LINE, left-right +0x7D +0x007D # RIGHT CURLY BRACKET, left-right +0x7E 0x007E # TILDE +# +0x80 0x00C4 # LATIN 
CAPITAL LETTER A WITH DIAERESIS +0x81 0x05F2+0x05B7 # HEBREW LIGATURE YIDDISH YOD YOD PATAH +0x82 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0x83 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0x84 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0x85 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0x86 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0x87 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0x88 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0x89 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0x8A 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0x8B 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0x8C 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0x8D 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0x8E 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0x8F 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0x90 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0x91 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0x92 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0x93 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0x94 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0x95 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0x96 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0x97 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0x98 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0x99 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0x9A 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0x9B 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0x9C 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0x9D 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0x9E 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0x9F 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xA0 +0x0020 # SPACE, right-left +0xA1 +0x0021 # EXCLAMATION MARK, right-left +0xA2 +0x0022 # QUOTATION MARK, right-left +0xA3 +0x0023 # NUMBER SIGN, right-left +0xA4 +0x0024 # DOLLAR SIGN, right-left +0xA5 +0x0025 # PERCENT SIGN, right-left +0xA6 0x20AA # NEW SHEQEL SIGN +0xA7 +0x0027 # APOSTROPHE, right-left +0xA8 +0x0029 # RIGHT PARENTHESIS, right-left # close parenthesis +0xA9 +0x0028 # LEFT PARENTHESIS, right-left # 
open parenthesis +0xAA +0x002A # ASTERISK, right-left +0xAB +0x002B # PLUS SIGN, right-left +0xAC +0x002C # COMMA, right-left +0xAD +0x002D # HYPHEN-MINUS, right-left +0xAE +0x002E # FULL STOP, right-left +0xAF +0x002F # SOLIDUS, right-left +0xB0 +0x0030 # DIGIT ZERO, right-left (need override) +0xB1 +0x0031 # DIGIT ONE, right-left (need override) +0xB2 +0x0032 # DIGIT TWO, right-left (need override) +0xB3 +0x0033 # DIGIT THREE, right-left (need override) +0xB4 +0x0034 # DIGIT FOUR, right-left (need override) +0xB5 +0x0035 # DIGIT FIVE, right-left (need override) +0xB6 +0x0036 # DIGIT SIX, right-left (need override) +0xB7 +0x0037 # DIGIT SEVEN, right-left (need override) +0xB8 +0x0038 # DIGIT EIGHT, right-left (need override) +0xB9 +0x0039 # DIGIT NINE, right-left (need override) +0xBA +0x003A # COLON, right-left +0xBB +0x003B # SEMICOLON, right-left +0xBC +0x003C # LESS-THAN SIGN, right-left +0xBD +0x003D # EQUALS SIGN, right-left +0xBE +0x003E # GREATER-THAN SIGN, right-left +0xBF +0x003F # QUESTION MARK, right-left +0xC0 0xF86A+0x05DC+0x05B9 # Hebrew ligature lamed holam +0xC1 +0x201E # DOUBLE LOW-9 QUOTATION MARK, right-left +0xC2 0xF89B # Hebrew canoral 1 +0xC3 0xF89C # Hebrew canoral 2 +0xC4 0xF89D # Hebrew canoral 3 +0xC5 0xF89E # Hebrew canoral 4 +0xC6 0x05BC # HEBREW POINT DAGESH OR MAPIQ +0xC7 0xFB4B # HEBREW LETTER VAV WITH HOLAM +0xC8 0xFB35 # HEBREW LETTER VAV WITH DAGESH +0xC9 +0x2026 # HORIZONTAL ELLIPSIS, right-left +0xCA +0x00A0 # NO-BREAK SPACE, right-left +0xCB 0x05B8 # HEBREW POINT QAMATS +0xCC 0x05B7 # HEBREW POINT PATAH +0xCD 0x05B5 # HEBREW POINT TSERE +0xCE 0x05B6 # HEBREW POINT SEGOL +0xCF 0x05B4 # HEBREW POINT HIRIQ +0xD0 +0x2013 # EN DASH, right-left +0xD1 +0x2014 # EM DASH, right-left +0xD2 +0x201C # LEFT DOUBLE QUOTATION MARK, right-left +0xD3 +0x201D # RIGHT DOUBLE QUOTATION MARK, right-left +0xD4 +0x2018 # LEFT SINGLE QUOTATION MARK, right-left +0xD5 +0x2019 # RIGHT SINGLE QUOTATION MARK, right-left +0xD6 0xFB2A # HEBREW LETTER SHIN 
WITH SHIN DOT +0xD7 0xFB2B # HEBREW LETTER SHIN WITH SIN DOT +0xD8 0x05BF # HEBREW POINT RAFE +0xD9 0x05B0 # HEBREW POINT SHEVA +0xDA 0x05B2 # HEBREW POINT HATAF PATAH +0xDB 0x05B1 # HEBREW POINT HATAF SEGOL +0xDC 0x05BB # HEBREW POINT QUBUTS +0xDD 0x05B9 # HEBREW POINT HOLAM +0xDE 0x05B8+0xF87F # HEBREW POINT QAMATS, alternate form "qamats qatan" +0xDF 0x05B3 # HEBREW POINT HATAF QAMATS +0xE0 0x05D0 # HEBREW LETTER ALEF +0xE1 0x05D1 # HEBREW LETTER BET +0xE2 0x05D2 # HEBREW LETTER GIMEL +0xE3 0x05D3 # HEBREW LETTER DALET +0xE4 0x05D4 # HEBREW LETTER HE +0xE5 0x05D5 # HEBREW LETTER VAV +0xE6 0x05D6 # HEBREW LETTER ZAYIN +0xE7 0x05D7 # HEBREW LETTER HET +0xE8 0x05D8 # HEBREW LETTER TET +0xE9 0x05D9 # HEBREW LETTER YOD +0xEA 0x05DA # HEBREW LETTER FINAL KAF +0xEB 0x05DB # HEBREW LETTER KAF +0xEC 0x05DC # HEBREW LETTER LAMED +0xED 0x05DD # HEBREW LETTER FINAL MEM +0xEE 0x05DE # HEBREW LETTER MEM +0xEF 0x05DF # HEBREW LETTER FINAL NUN +0xF0 0x05E0 # HEBREW LETTER NUN +0xF1 0x05E1 # HEBREW LETTER SAMEKH +0xF2 0x05E2 # HEBREW LETTER AYIN +0xF3 0x05E3 # HEBREW LETTER FINAL PE +0xF4 0x05E4 # HEBREW LETTER PE +0xF5 0x05E5 # HEBREW LETTER FINAL TSADI +0xF6 0x05E6 # HEBREW LETTER TSADI +0xF7 0x05E7 # HEBREW LETTER QOF +0xF8 0x05E8 # HEBREW LETTER RESH +0xF9 0x05E9 # HEBREW LETTER SHIN +0xFA 0x05EA # HEBREW LETTER TAV +0xFB +0x007D # RIGHT CURLY BRACKET, right-left +0xFC +0x005D # RIGHT SQUARE BRACKET, right-left +0xFD +0x007B # LEFT CURLY BRACKET, right-left +0xFE +0x005B # LEFT SQUARE BRACKET, right-left +0xFF +0x007C # VERTICAL LINE, right-left diff --git a/charsets/mac-roman.txt b/charsets/mac-roman.txt new file mode 100644 index 0000000..5b3b8b4 --- /dev/null +++ b/charsets/mac-roman.txt @@ -0,0 +1,370 @@ +#======================================================================= +# File name: ROMAN.TXT +# +# Contents: Map (external version) from Mac OS Roman +# character set to Unicode 2.1 and later. 
+# +# Copyright: (c) 1994-2002, 2005 by Apple Computer, Inc., all rights +# reserved. +# +# Contact: charsets@apple.com +# +# Changes: +# +# c02 2005-Apr-05 Update header comments. Matches internal xml +# and Text Encoding Converter 2.0. +# b4,c1 2002-Dec-19 Update URLs, notes. Matches internal +# utom. +# b03 1999-Sep-22 Update contact e-mail address. Matches +# internal utom, ufrm, and Text +# Encoding Converter version 1.5. +# b02 1998-Aug-18 Encoding changed for Mac OS 8.5; change +# mapping of 0xDB from CURRENCY SIGN to +# EURO SIGN. Matches internal utom, +# ufrm. +# n08 1998-Feb-05 Minor update to header comments +# n06 1997-Dec-14 Add warning about future changes to 0xDB +# from CURRENCY SIGN to EURO SIGN. Clarify +# some header information +# n04 1997-Dec-01 Update to match internal utom, ufrm: +# Change standard mapping for 0xBD from U+2126 +# to its canonical decomposition, U+03A9. +# n03 1995-Apr-15 First version (after fixing some typos). +# Matches internal ufrm. +# +# Standard header: +# ---------------- +# +# Apple, the Apple logo, and Macintosh are trademarks of Apple +# Computer, Inc., registered in the United States and other countries. +# Unicode is a trademark of Unicode Inc. For the sake of brevity, +# throughout this document, "Macintosh" can be used to refer to +# Macintosh computers and "Unicode" can be used to refer to the +# Unicode standard. +# +# Apple Computer, Inc. ("Apple") makes no warranty or representation, +# either express or implied, with respect to this document and the +# included data, its quality, accuracy, or fitness for a particular +# purpose. In no event will Apple be liable for direct, indirect, +# special, incidental, or consequential damages resulting from any +# defect or inaccuracy in this document or the included data. +# +# These mapping tables and character lists are subject to change. 
+# The latest tables should be available from the following: +# +# +# +# For general information about Mac OS encodings and these mapping +# tables, see the file "README.TXT". +# +# Format: +# ------- +# +# Three tab-separated columns; +# '#' begins a comment which continues to the end of the line. +# Column #1 is the Mac OS Roman code (in hex as 0xNN) +# Column #2 is the corresponding Unicode (in hex as 0xNNNN) +# Column #3 is a comment containing the Unicode name +# +# The entries are in Mac OS Roman code order. +# +# One of these mappings requires the use of a corporate character. +# See the file "CORPCHAR.TXT" and notes below. +# +# Control character mappings are not shown in this table, following +# the conventions of the standard UTC mapping tables. However, the +# Mac OS Roman character set uses the standard control characters at +# 0x00-0x1F and 0x7F. +# +# Notes on Mac OS Roman: +# ---------------------- +# +# This is a legacy Mac OS encoding; in the Mac OS X Carbon and Cocoa +# environments, it is only supported directly in programming +# interfaces for QuickDraw Text, the Script Manager, and related +# Text Utilities. For other purposes it is supported via transcoding +# to and from Unicode. +# +# This character set is used for at least the following Mac OS +# localizations: U.S., British, Canadian French, French, Swiss +# French, German, Swiss German, Italian, Swiss Italian, Dutch, +# Swedish, Norwegian, Danish, Finnish, Spanish, Catalan, +# Portuguese, Brazilian, and the default International system. +# +# Variants of Mac OS Roman are used for Croatian, Icelandic, +# Turkish, Romanian, and other encodings. Separate mapping tables +# are available for these encodings. +# +# Before Mac OS 8.5, code point 0xDB was CURRENCY SIGN, and was +# mapped to U+00A4. In Mac OS 8.5 and later versions, code point +# 0xDB is changed to EURO SIGN and maps to U+20AC; the standard +# Apple fonts are updated for Mac OS 8.5 to reflect this. 
There is +# a "currency sign" variant of the Mac OS Roman encoding that still +# maps 0xDB to U+00A4; this can be used for older fonts. +# +# Before Mac OS 8.5, the ROM bitmap versions of the fonts Chicago, +# New York, Geneva, and Monaco did not implement the full Mac OS +# Roman character set; they only supported character codes up to +# 0xD8. The TrueType versions of these fonts have always implemented +# the full character set, as with the bitmap and TrueType versions +# of the other standard Roman fonts. +# +# In all Mac OS encodings, fonts such as Chicago which are used +# as "system" fonts (for menus, dialogs, etc.) have four glyphs +# at code points 0x11-0x14 for transient use by the Menu Manager. +# These glyphs are not intended as characters for use in normal +# text, and the associated code points are not generally +# interpreted as associated with these glyphs; they are usually +# interpreted (if at all) as the control codes DC1-DC4. +# +# Unicode mapping issues and notes: +# --------------------------------- +# +# The following corporate zone Unicode character is used in this +# mapping: +# +# 0xF8FF Apple logo +# +# NOTE: The graphic image associated with the Apple logo character +# is not authorized for use without permission of Apple, and +# unauthorized use might constitute trademark infringement. +# +# Details of mapping changes in each version: +# ------------------------------------------- +# +# Changes from version n08 to version b02: +# +# - Encoding changed for Mac OS 8.5; change mapping of 0xDB from +# CURRENCY SIGN (U+00A4) to EURO SIGN (U+20AC). +# +# Changes from version n03 to version n04: +# +# - Change mapping of 0xBD from U+2126 to its canonical +# decomposition, U+03A9. 
+# +################## + +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET 
+0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +# +0x80 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0x81 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0x82 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0x83 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0x84 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0x85 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0x86 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0x87 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0x88 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0x89 0x00E2 # LATIN SMALL LETTER A WITH CIRCUMFLEX +0x8A 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0x8B 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0x8C 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0x8D 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0x8E 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0x8F 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0x90 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX 
+0x91 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0x92 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0x93 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0x94 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0x95 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0x96 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0x97 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0x98 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0x99 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0x9A 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0x9B 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0x9C 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0x9D 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0x9E 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0x9F 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xA0 0x2020 # DAGGER +0xA1 0x00B0 # DEGREE SIGN +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A7 # SECTION SIGN +0xA5 0x2022 # BULLET +0xA6 0x00B6 # PILCROW SIGN +0xA7 0x00DF # LATIN SMALL LETTER SHARP S +0xA8 0x00AE # REGISTERED SIGN +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x2122 # TRADE MARK SIGN +0xAB 0x00B4 # ACUTE ACCENT +0xAC 0x00A8 # DIAERESIS +0xAD 0x2260 # NOT EQUAL TO +0xAE 0x00C6 # LATIN CAPITAL LETTER AE +0xAF 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xB0 0x221E # INFINITY +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x2264 # LESS-THAN OR EQUAL TO +0xB3 0x2265 # GREATER-THAN OR EQUAL TO +0xB4 0x00A5 # YEN SIGN +0xB5 0x00B5 # MICRO SIGN +0xB6 0x2202 # PARTIAL DIFFERENTIAL +0xB7 0x2211 # N-ARY SUMMATION +0xB8 0x220F # N-ARY PRODUCT +0xB9 0x03C0 # GREEK SMALL LETTER PI +0xBA 0x222B # INTEGRAL +0xBB 0x00AA # FEMININE ORDINAL INDICATOR +0xBC 0x00BA # MASCULINE ORDINAL INDICATOR +0xBD 0x03A9 # GREEK CAPITAL LETTER OMEGA +0xBE 0x00E6 # LATIN SMALL LETTER AE +0xBF 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xC0 0x00BF # INVERTED QUESTION MARK +0xC1 0x00A1 # INVERTED EXCLAMATION MARK +0xC2 0x00AC # NOT SIGN +0xC3 0x221A # SQUARE ROOT +0xC4 0x0192 # LATIN SMALL LETTER F WITH HOOK +0xC5 0x2248 # ALMOST EQUAL TO +0xC6 0x2206 # 
INCREMENT +0xC7 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC8 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xC9 0x2026 # HORIZONTAL ELLIPSIS +0xCA 0x00A0 # NO-BREAK SPACE +0xCB 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE +0xCC 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xCD 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xCE 0x0152 # LATIN CAPITAL LIGATURE OE +0xCF 0x0153 # LATIN SMALL LIGATURE OE +0xD0 0x2013 # EN DASH +0xD1 0x2014 # EM DASH +0xD2 0x201C # LEFT DOUBLE QUOTATION MARK +0xD3 0x201D # RIGHT DOUBLE QUOTATION MARK +0xD4 0x2018 # LEFT SINGLE QUOTATION MARK +0xD5 0x2019 # RIGHT SINGLE QUOTATION MARK +0xD6 0x00F7 # DIVISION SIGN +0xD7 0x25CA # LOZENGE +0xD8 0x00FF # LATIN SMALL LETTER Y WITH DIAERESIS +0xD9 0x0178 # LATIN CAPITAL LETTER Y WITH DIAERESIS +0xDA 0x2044 # FRACTION SLASH +0xDB 0x20AC # EURO SIGN +0xDC 0x2039 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0xDD 0x203A # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0xDE 0xFB01 # LATIN SMALL LIGATURE FI +0xDF 0xFB02 # LATIN SMALL LIGATURE FL +0xE0 0x2021 # DOUBLE DAGGER +0xE1 0x00B7 # MIDDLE DOT +0xE2 0x201A # SINGLE LOW-9 QUOTATION MARK +0xE3 0x201E # DOUBLE LOW-9 QUOTATION MARK +0xE4 0x2030 # PER MILLE SIGN +0xE5 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xE6 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xE7 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xE8 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xE9 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE +0xEA 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xEB 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xEC 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xED 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE +0xEE 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xEF 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xF0 0xF8FF # Apple logo +0xF1 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE +0xF2 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xF3 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xF4 0x00D9 # LATIN CAPITAL LETTER U WITH 
GRAVE +0xF5 0x0131 # LATIN SMALL LETTER DOTLESS I +0xF6 0x02C6 # MODIFIER LETTER CIRCUMFLEX ACCENT +0xF7 0x02DC # SMALL TILDE +0xF8 0x00AF # MACRON +0xF9 0x02D8 # BREVE +0xFA 0x02D9 # DOT ABOVE +0xFB 0x02DA # RING ABOVE +0xFC 0x00B8 # CEDILLA +0xFD 0x02DD # DOUBLE ACUTE ACCENT +0xFE 0x02DB # OGONEK +0xFF 0x02C7 # CARON diff --git a/charsets/tex.rpl b/charsets/tex.rpl new file mode 100644 index 0000000..d9287d3 --- /dev/null +++ b/charsets/tex.rpl @@ -0,0 +1,94 @@ +A1 "!`" +A8 '\\"{}' +AB "<<" +AD "\\-" +AE "\\circled{R}" +B0 "${}^\\circle$" +B1 "$\\pm$" +B2 "${}^2$" +B3 "${}^3$" +B4 "\\'{}" +B5 "$\\mu$" +B7 "$\\cdot$" +B9 "${}^1$" +BA "${}^{0\\over{}}$" +BB ">>" +BC "$1\\over4$" +BD "$1\\over2$" +BE "$3\\over4$" +BF "?`" +C0 "\\`A" +C1 "\\'A" +C2 "\\^A" +C3 "\\~A" +C4 '\\"A' +C5 "\v{A}" +C6 "\\AE{}" +C7 "\\c{C}" +C8 "\\`E" +C9 "\\'E" +CA "\\^E" +CB '\\"E' +CC "\\`I" +CD "\\'I" +CE "\\^I" +CF '\\"I' +D1 "\\~N" +D2 "\\`O" +D3 "\\'O" +D4 "\\^O" +D5 "\\~O" +D6 '\\"O' +D7 "$\\times$" +D8 "\O{}" +D9 "\\`U" +DA "\\'U" +DB "\\~U" +DC '\\"U' +DD "\'Y" +DF "\ss{}" +E0 "\\`a" +E1 "\\'a" +E2 "\\^a" +E3 "\\~a" +E4 '\\"a' +E5 "\\r{a}" +E6 "\\ae{}" +E7 "\\c{c}" +E8 "\\`e" +E9 "\\'e" +EA "\\^e" +EB '\\"e' +EC "\\`{\\i}" +ED "\\'{\\i}" +EE "\\^{\\i}" +EF '\\"{\\i}' +F1 "\\~n" +F2 "\\`o" +F3 "\\'o" +F4 "\\^o" +F5 "\\~o" +F6 '\\"o' +F8 "\\o{}" +F9 "\\`u" +FA "\\'u" +FB "\\^u" +FC '\\"u' +FD "\\'y" +FF '\\"y' +201A "`" +201E "''" +2026 "\\dots" +2020 "\\dagger" +2030 "${}^0/{}_{00}$" +2039 "<" +2018 "`" +2019 "\'" +201C "``" +201D "''" +2022 "\\bullet" +2013 -- +2014 --- +2122 "\trademark" +203A ">" +2116 "\\No" diff --git a/charsets/tex.spc b/charsets/tex.spc new file mode 100644 index 0000000..cc77cdc --- /dev/null +++ b/charsets/tex.spc @@ -0,0 +1,18 @@ +1C "&" +1E "\\\\\n" +23 "\\#" +26 "\\&" +25 "\\%" +24 "\\$" +5F "\\_" +7B "$\{$" +7D "$\}$" +5B "$\[$" +5D "$\]$" +5E "\\asciicircum{}" +5C "$\\backslash$" +7E "\\asciitilde{}" +A9 "\\copyright{}" +AD "\\-" +A0 "~" +FEFF "{}" diff 
--git a/charsets/us-ascii.txt b/charsets/us-ascii.txt new file mode 100644 index 0000000..cc28e9e --- /dev/null +++ b/charsets/us-ascii.txt @@ -0,0 +1,98 @@ +# +# Name: US-ASCII\ to Unicode +# Unicode version: 1.1 +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL 
LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE diff --git a/compat/.cvsignore b/compat/.cvsignore new file mode 100644 index 0000000..dd98ed3 --- /dev/null +++ b/compat/.cvsignore @@ -0,0 +1,2 @@ +Makefile +semantic.cache diff --git a/compat/langinfo.c b/compat/langinfo.c new file mode 100644 index 0000000..9bee0e3 --- /dev/null +++ b/compat/langinfo.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include + +static char *badParam=""; +char* __get_dos_codepage(void) { + static char codePageName[10]; + union REGS regs; + regs.x.ax=0x6601; + intdos(&regs,&regs); + sprintf(codePageName,"cp%d",(regs.x.bx & 0xFFFF)); + if (regs.x.cflag) { + return badParam; + } + return codePageName; +} +char *nl_langinfo(nl_item item)
{ + if (item == CODESET) { + return __get_dos_codepage(); + } else { + return badParam; + } +} + diff --git a/compat/langinfo.h b/compat/langinfo.h new file mode 100644 index 0000000..ba0240d --- /dev/null +++ b/compat/langinfo.h @@ -0,0 +1,7 @@ +#ifndef LANGINFO_H +#define LANGINFO_H +typedef int nl_item; +#define CODESET 1 + +char *nl_langinfo(nl_item item); +#endif diff --git a/compat/strftime.c b/compat/strftime.c new file mode 100644 index 0000000..d40e1ca --- /dev/null +++ b/compat/strftime.c @@ -0,0 +1,70 @@ +#include +#include +#include +#include "strftime.h" + +static const char *monthAbbr[]={ + "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"}; +size_t strftime(char *s,size_t max, const char *format, + const struct tm *tm) { + size_t i=0,j=0; + while(j=max) return 0; + j+=sprintf(s+j,"%02d",tm->tm_mon+1); + break; + case 'd': if ((j+2) >=max) return 0; + j+=sprintf(s+j,"%02d",tm->tm_mday); + break; + case 'y': if ((j+2) >=max) return 0; + j+=sprintf(s+j,"%02d",tm->tm_year%100); + break; + case 'Y': if ((j+4) >=max) return 0; + j+=sprintf(s+j,"%d",tm->tm_year+1900); + break; + case 'b': if ((j+3)>=max) return 0; + strcpy(s+j,monthAbbr[tm->tm_mon]); + j+=3; + break; + case 'l': if ((j+2) >= max) return 0; + { int hour = tm->tm_hour; + if (hour>12) hour -=12; + if (hour == 0) hour = 12; + j+=sprintf(s+j,"%2d",hour); + break; + } + case 'p': if ((j+2) >= max) return 0; + if (tm->tm_hour >11) { + strcpy(s+j,"PM"); + } else { + strcpy(s+j,"AM"); + } + j+=2; + break; + case 'H': if ((j+2) >= max) return 0; + j+=sprintf(s+j,"%02d",tm->tm_hour); + break; + case 'M':if ((j+2) >= max) return 0; + j+=sprintf(s+j,"%02d",tm->tm_min); + break; + case 'S':if ((j+2) >= max) return 0; + j+=sprintf(s+j,"%02d",tm->tm_sec); + break; + default: + ; + } + i++; + } + } + if (j>=max) return 0; + else { + s[j]=0; + return j; + } +} diff --git a/compat/strftime.h b/compat/strftime.h new file mode 100644 index 0000000..8564de4 --- /dev/null +++ 
b/compat/strftime.h @@ -0,0 +1,9 @@ +#ifndef STRFTIME_H +#define STRFTIME_H +#include +#include + +size_t strftime(char *s,size_t max, const char *format, + const struct tm *tm); + +#endif diff --git a/compat/unistd.h b/compat/unistd.h new file mode 100644 index 0000000..1d25bad --- /dev/null +++ b/compat/unistd.h @@ -0,0 +1,9 @@ +#ifndef UNISTD_H +#define UNISTD_H +#include +#include +extern int optind; +extern char *optarg; +extern int opterr; +int getopt(int argc, char *argv[], char *optionS); +#endif diff --git a/configure b/configure new file mode 100755 index 0000000..6b0d272 --- /dev/null +++ b/configure @@ -0,0 +1,2022 @@ +#! /bin/sh + +# Guess values for system-dependent variables and create Makefiles. +# Generated automatically using autoconf version 2.13 +# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc. +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. + +# Defaults: +ac_help= +ac_default_prefix=/usr/local +# Any additions from configure.in: +ac_help="$ac_help + --with-wish=fullpath allows to specify full path for Tk interpreter to + avoid checking, which requires X" +ac_help="$ac_help + --with-install-root=path + allows to install catdoc into other directory + than compilied-in path points to" +ac_help="$ac_help + --with-input=charset set default charset to expect in 8-bit word files" +ac_help="$ac_help + --with-output=charset set default charset to output" +ac_help="$ac_help + --disable-charset-check allow make in charsets directory to succeed + even if files for default charsets are not found" +ac_help="$ac_help + --disable-wordview Do not install tcl-tk + wrapper" +ac_help="$ac_help + --disable-langinfo Do not use system +locale for output encoding" + +# Initialize some variables set by options. +# The variables have the same names as the options, with +# dashes changed to underlines. 
+build=NONE +cache_file=./config.cache +exec_prefix=NONE +host=NONE +no_create= +nonopt=NONE +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +target=NONE +verbose= +x_includes=NONE +x_libraries=NONE +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datadir='${prefix}/share' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +libdir='${exec_prefix}/lib' +includedir='${prefix}/include' +oldincludedir='/usr/include' +infodir='${prefix}/info' +mandir='${prefix}/man' + +# Initialize some other variables. +subdirs= +MFLAGS= MAKEFLAGS= +SHELL=${CONFIG_SHELL-/bin/sh} +# Maximum number of lines to put in a shell here document. +ac_max_here_lines=12 + +ac_prev= +for ac_option +do + + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval "$ac_prev=\$ac_option" + ac_prev= + continue + fi + + case "$ac_option" in + -*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;; + *) ac_optarg= ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. 
+ + case "$ac_option" in + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir="$ac_optarg" ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build="$ac_optarg" ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file="$ac_optarg" ;; + + -datadir | --datadir | --datadi | --datad | --data | --dat | --da) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \ + | --da=*) + datadir="$ac_optarg" ;; + + -disable-* | --disable-*) + ac_feature=`echo $ac_option|sed -e 's/-*disable-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + eval "enable_${ac_feature}=no" ;; + + -enable-* | --enable-*) + ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. 
+ if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "enable_${ac_feature}='$ac_optarg'" ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix="$ac_optarg" ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he) + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat << EOF +Usage: configure [options] [host] +Options: [defaults in brackets after descriptions] +Configuration: + --cache-file=FILE cache test results in FILE + --help print this message + --no-create do not create output files + --quiet, --silent do not print \`checking...' 
messages + --version print the version of autoconf that created configure +Directory and file names: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [same as prefix] + --bindir=DIR user executables in DIR [EPREFIX/bin] + --sbindir=DIR system admin executables in DIR [EPREFIX/sbin] + --libexecdir=DIR program executables in DIR [EPREFIX/libexec] + --datadir=DIR read-only architecture-independent data in DIR + [PREFIX/share] + --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data in DIR + [PREFIX/com] + --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var] + --libdir=DIR object code libraries in DIR [EPREFIX/lib] + --includedir=DIR C header files in DIR [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include] + --infodir=DIR info documentation in DIR [PREFIX/info] + --mandir=DIR man documentation in DIR [PREFIX/man] + --srcdir=DIR find the sources in DIR [configure dir or ..] 
+ --program-prefix=PREFIX prepend PREFIX to installed program names + --program-suffix=SUFFIX append SUFFIX to installed program names + --program-transform-name=PROGRAM + run sed PROGRAM on installed program names +EOF + cat << EOF +Host type: + --build=BUILD configure for building on BUILD [BUILD=HOST] + --host=HOST configure for HOST [guessed] + --target=TARGET configure for TARGET [TARGET=HOST] +Features and packages: + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --x-includes=DIR X include files are in DIR + --x-libraries=DIR X library files are in DIR +EOF + if test -n "$ac_help"; then + echo "--enable and --with options recognized:$ac_help" + fi + exit 0 ;; + + -host | --host | --hos | --ho) + ac_prev=host ;; + -host=* | --host=* | --hos=* | --ho=*) + host="$ac_optarg" ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir="$ac_optarg" ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir="$ac_optarg" ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir="$ac_optarg" ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir="$ac_optarg" ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | 
--localst \ + | --locals | --local | --loca | --loc | --lo) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* \ + | --locals=* | --local=* | --loca=* | --loc=* | --lo=*) + localstatedir="$ac_optarg" ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir="$ac_optarg" ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir="$ac_optarg" ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix="$ac_optarg" ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix="$ac_optarg" ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | 
--program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix="$ac_optarg" ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name="$ac_optarg" ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir="$ac_optarg" ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir="$ac_optarg" ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site="$ac_optarg" ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + 
srcdir="$ac_optarg" ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir="$ac_optarg" ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target="$ac_optarg" ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers) + echo "configure generated by autoconf version 2.13" + exit 0 ;; + + -with-* | --with-*) + ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "with_${ac_package}='$ac_optarg'" ;; + + -without-* | --without-*) + ac_package=`echo $ac_option|sed -e 's/-*without-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + eval "with_${ac_package}=no" ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes="$ac_optarg" ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries="$ac_optarg" ;; + + -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; } + ;; + + *) + if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then + echo "configure: warning: $ac_option: invalid host type" 1>&2 + fi + if test "x$nonopt" != xNONE; then + { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; } + fi + nonopt="$ac_option" + ;; + + esac +done + +if test -n "$ac_prev"; then + { echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; } +fi + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +# File descriptor usage: +# 0 standard input +# 1 file creation +# 2 errors and warnings +# 3 some systems may open it to /dev/tty +# 4 used on the Kubota Titan +# 6 checking for... messages and results +# 5 compiler messages saved in config.log +if test "$silent" = yes; then + exec 6>/dev/null +else + exec 6>&1 +fi +exec 5>./config.log + +echo "\ +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. +" 1>&5 + +# Strip out --no-create and --no-recursion so they do not pile up. +# Also quote any args containing shell metacharacters. 
+ac_configure_args= +for ac_arg +do + case "$ac_arg" in + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) ;; + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;; + *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*) + ac_configure_args="$ac_configure_args '$ac_arg'" ;; + *) ac_configure_args="$ac_configure_args $ac_arg" ;; + esac +done + +# NLS nuisances. +# Only set these to C if already set. These must not be set unconditionally +# because not all systems understand e.g. LANG=C (notably SCO). +# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'! +# Non-C LC_CTYPE values break the ctype check. +if test "${LANG+set}" = set; then LANG=C; export LANG; fi +if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi +if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi +if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -rf conftest* confdefs.h +# AIX cpp loses on an empty file, so make sure it contains at least a newline. +echo > confdefs.h + +# A filename unique to this package, relative to the directory that +# configure is in, which we can look for to find out if srcdir is correct. +ac_unique_file=acconfig.h + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then its parent. + ac_prog=$0 + ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'` + test "x$ac_confdir" = "x$ac_prog" && ac_confdir=. + srcdir=$ac_confdir + if test ! -r $srcdir/$ac_unique_file; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r $srcdir/$ac_unique_file; then + if test "$ac_srcdir_defaulted" = yes; then + { echo "configure: error: can not find sources in $ac_confdir or .." 
1>&2; exit 1; } + else + { echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; } + fi +fi +srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'` + +# Prefer explicitly selected file to automatically selected ones. +if test -z "$CONFIG_SITE"; then + if test "x$prefix" != xNONE; then + CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site" + else + CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" + fi +fi +for ac_site_file in $CONFIG_SITE; do + if test -r "$ac_site_file"; then + echo "loading site script $ac_site_file" + . "$ac_site_file" + fi +done + +if test -r "$cache_file"; then + echo "loading cache $cache_file" + . $cache_file +else + echo "creating cache $cache_file" + > $cache_file +fi + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +ac_exeext= +ac_objext=o +if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then + # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu. + if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then + ac_n= ac_c=' +' ac_t=' ' + else + ac_n=-n ac_c= ac_t= + fi +else + ac_n= ac_c='\c' ac_t= +fi + + +catdoc_version=0.94 +# Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:549: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. 
+ if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="gcc" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:579: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_prog_rejected=no + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + break + fi + done + IFS="$ac_save_ifs" +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# -gt 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. + shift + set dummy "$ac_dir/$ac_word" "$@" + shift + ac_cv_prog_CC="$@" + fi +fi +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + if test -z "$CC"; then + case "`uname -s`" in + *win32* | *WIN32*) + # Extract the first word of "cl", so it can be a program name with args. +set dummy cl; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:630: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. 
+else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="cl" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + ;; + esac + fi + test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; } +fi + +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6 +echo "configure:662: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5 + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +cat > conftest.$ac_ext << EOF + +#line 673 "configure" +#include "confdefs.h" + +main(){return(0);} +EOF +if { (eval echo configure:678: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + ac_cv_prog_cc_works=yes + # If we can't run a trivial program, we are probably using a cross compiler. + if (./conftest; exit) 2>/dev/null; then + ac_cv_prog_cc_cross=no + else + ac_cv_prog_cc_cross=yes + fi +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + ac_cv_prog_cc_works=no +fi +rm -fr conftest* +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. 
+ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +echo "$ac_t""$ac_cv_prog_cc_works" 1>&6 +if test $ac_cv_prog_cc_works = no; then + { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; } +fi +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6 +echo "configure:704: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5 +echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6 +cross_compiling=$ac_cv_prog_cc_cross + +echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6 +echo "configure:709: checking whether we are using GNU C" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.c <&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then + ac_cv_prog_gcc=yes +else + ac_cv_prog_gcc=no +fi +fi + +echo "$ac_t""$ac_cv_prog_gcc" 1>&6 + +if test $ac_cv_prog_gcc = yes; then + GCC=yes +else + GCC= +fi + +ac_test_CFLAGS="${CFLAGS+set}" +ac_save_CFLAGS="$CFLAGS" +CFLAGS= +echo $ac_n "checking whether ${CC-cc} accepts -g""... 
$ac_c" 1>&6 +echo "configure:737: checking whether ${CC-cc} accepts -g" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'void f(){}' > conftest.c +if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then + ac_cv_prog_cc_g=yes +else + ac_cv_prog_cc_g=no +fi +rm -f conftest* + +fi + +echo "$ac_t""$ac_cv_prog_cc_g" 1>&6 +if test "$ac_test_CFLAGS" = set; then + CFLAGS="$ac_save_CFLAGS" +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi + +case ${CC} in +*djgpp*) ac_cv_c_bigendian=no + ac_cv_func_setvbuf_reversed=no +;; +*) +;; +esac +echo $ac_n "checking whether byte ordering is bigendian""... $ac_c" 1>&6 +echo "configure:776: checking whether byte ordering is bigendian" >&5 +if eval "test \"`echo '$''{'ac_cv_c_bigendian'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_cv_c_bigendian=unknown +# See if sys/param.h defines the BYTE_ORDER macro. +cat > conftest.$ac_ext < +#include +int main() { + +#if !BYTE_ORDER || !BIG_ENDIAN || !LITTLE_ENDIAN + bogus endian macros +#endif +; return 0; } +EOF +if { (eval echo configure:794: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + # It does; now see whether it defined to BIG_ENDIAN or not. 
+cat > conftest.$ac_ext < +#include +int main() { + +#if BYTE_ORDER != BIG_ENDIAN + not big endian +#endif +; return 0; } +EOF +if { (eval echo configure:809: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + ac_cv_c_bigendian=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_c_bigendian=no +fi +rm -f conftest* +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 +fi +rm -f conftest* +if test $ac_cv_c_bigendian = unknown; then +if test "$cross_compiling" = yes; then + { echo "configure: error: can not run test program while cross compiling" 1>&2; exit 1; } +else + cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_c_bigendian=no +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_c_bigendian=yes +fi +rm -fr conftest* +fi + +fi +fi + +echo "$ac_t""$ac_cv_c_bigendian" 1>&6 +if test $ac_cv_c_bigendian = yes; then + cat >> confdefs.h <<\EOF +#define WORDS_BIGENDIAN 1 +EOF + +fi + +ac_aux_dir= +for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do + if test -f $ac_dir/install-sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f $ac_dir/install.sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + fi +done +if test -z "$ac_aux_dir"; then + { echo "configure: error: can not find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; } +fi +ac_config_guess=$ac_aux_dir/config.guess +ac_config_sub=$ac_aux_dir/config.sub +ac_configure=$ac_aux_dir/configure # This should be Cygnus configure. + +# Find a good install program. We prefer a C program (faster), +# so one script is as good as another. 
But avoid the broken or +# incompatible versions: +# SysV /etc/install, /usr/sbin/install +# SunOS /usr/etc/install +# IRIX /sbin/install +# AIX /bin/install +# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag +# AFS /usr/afsws/bin/install, which mishandles nonexistent args +# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" +# ./install, which can be erroneously created by make from ./install.sh. +echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6 +echo "configure:896: checking for a BSD compatible install" >&5 +if test -z "$INSTALL"; then +if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":" + for ac_dir in $PATH; do + # Account for people who put trailing slashes in PATH elements. + case "$ac_dir/" in + /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;; + *) + # OSF1 and SCO ODT 3.0 have their own names for install. + # Don't use installbsd from OSF since it installs stuff as root + # by default. + for ac_prog in ginstall scoinst install; do + if test -f $ac_dir/$ac_prog; then + if test $ac_prog = install && + grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then + # AIX install. It has an incompatible calling convention. + : + else + ac_cv_path_install="$ac_dir/$ac_prog -c" + break 2 + fi + fi + done + ;; + esac + done + IFS="$ac_save_IFS" + +fi + if test "${ac_cv_path_install+set}" = set; then + INSTALL="$ac_cv_path_install" + else + # As a last resort, use the slow shell script. We don't cache a + # path for INSTALL within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the path is relative. + INSTALL="$ac_install_sh" + fi +fi +echo "$ac_t""$INSTALL" 1>&6 + +# Use test -z because SunOS4 sh mishandles braces in ${var-val}. +# It thinks the first close brace ends the variable substitution. 
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' + +test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}' + +test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' + +if test "$enable_wordview" != "no" ;then +# Check whether --with-wish or --without-wish was given. +if test "${with_wish+set}" = set; then + withval="$with_wish" + WISH=$withval +else + +for ac_prog in wish wish8.1 wish8.2 wish8.3 wish8.4 +do +# Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:960: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_WISH'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$WISH" in + /*) + ac_cv_path_WISH="$WISH" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_WISH="$WISH" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_WISH="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +WISH="$ac_cv_path_WISH" +if test -n "$WISH"; then + echo "$ac_t""$WISH" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +test -n "$WISH" && break +done + +if test -n "$WISH"; then +echo $ac_n "checking checking if wish version is 8.1 or above ""... $ac_c" 1>&6 +echo "configure:997: checking checking if wish version is 8.1 or above " >&5 +if eval "test \"`echo '$''{'GOOD_WISH'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'if {[info tclversion]>=7.6} { + puts -nonewline "yes" +} else { + puts -nonewline "no" +} +exit'>tmp$$.tcl +GOOD_WISH=`$WISH tmp$$.tcl` +rm tmp$$.tcl + +fi + +echo "$ac_t""$GOOD_WISH" 1>&6 +if test "$GOOD_WISH" != yes; then +enable_wordview=no +fi +fi + +fi + +fi +# Check whether --with-install-root or --without-install-root was given. 
+if test "${with_install_root+set}" = set; then + withval="$with_install_root" + installroot=$withval +fi + +replsuffix=.replchars +specsuffix=.specchars +targetcharset=koi8-r +sourcecharset=cp1251 +# Check whether --with-input or --without-input was given. +if test "${with_input+set}" = set; then + withval="$with_input" + sourcecharset=$withval +else + sourcecharset=cp1251 +fi + +# Check whether --with-output or --without-output was given. +if test "${with_output+set}" = set; then + withval="$with_output" + targetcharset=$withval +else + targetcharset=koi8-r +fi + +if test "$targetcharset" = "utf-8"; then +charsetcheck="$sourcecharset.txt" +else +charsetcheck="$sourcecharset.txt $targetcharset.txt" +fi +# Check whether --enable-charset-check or --disable-charset-check was given. +if test "${enable_charset_check+set}" = set; then + enableval="$enable_charset_check" + if test "$enable_charset_check" = no; +then + charsetcheck="" +fi +fi + +test -z "$manext" && manext=.1 +test -z "$man1dir" && man1dir=\${prefix}/man/man1 +if test -n "$WISH"; then +# Check whether --enable-wordview or --disable-wordview was given. +if test "${enable_wordview+set}" = set; then + enableval="$enable_wordview" + : +fi + +fi +if test "$enable_wordview" = no; then + installtargets=install-catdoc + buildtargets="catdoc xls2csv catppt" +else + installtargets="install-catdoc install-wordview" + buildtargets="catdoc xls2csv catppt wordview" +fi + +echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6 +echo "configure:1080: checking how to run the C preprocessor" >&5 +# On Suns, sometimes $CPP names a directory. +if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then +if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + # This must be in double quotes, not single quotes, because CPP may get + # substituted into the Makefile and "${CC-cc}" will confuse make. 
+ CPP="${CC-cc} -E" + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1101: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -E -traditional-cpp" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1118: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -nologo -E" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1135: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP=/lib/cpp +fi +rm -f conftest* +fi +rm -f conftest* +fi +rm -f conftest* + ac_cv_prog_CPP="$CPP" +fi + CPP="$ac_cv_prog_CPP" +else + ac_cv_prog_CPP="$CPP" +fi +echo "$ac_t""$CPP" 1>&6 + +echo $ac_n "checking for ANSI C header files""... 
$ac_c" 1>&6 +echo "configure:1160: checking for ANSI C header files" >&5 +if eval "test \"`echo '$''{'ac_cv_header_stdc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +#include +#include +#include +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1173: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + ac_cv_header_stdc=yes +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. +cat > conftest.$ac_ext < +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "memchr" >/dev/null 2>&1; then + : +else + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. +cat > conftest.$ac_ext < +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "free" >/dev/null 2>&1; then + : +else + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. +if test "$cross_compiling" = yes; then + : +else + cat > conftest.$ac_ext < +#define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +#define TOUPPER(c) (ISLOWER(c) ? 
'A' + ((c) - 'a') : (c)) +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int main () { int i; for (i = 0; i < 256; i++) +if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2); +exit (0); } + +EOF +if { (eval echo configure:1240: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + : +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_header_stdc=no +fi +rm -fr conftest* +fi + +fi +fi + +echo "$ac_t""$ac_cv_header_stdc" 1>&6 +if test $ac_cv_header_stdc = yes; then + cat >> confdefs.h <<\EOF +#define STDC_HEADERS 1 +EOF + +fi + +for ac_hdr in unistd.h +do +ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6 +echo "configure:1267: checking for $ac_hdr" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1277: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'` + cat >> confdefs.h <&6 +fi +done + + +if test -z "$enable_langinfo"; then +enable_langinfo=yes +fi +# Check whether --enable-langinfo or --disable-langinfo was given. 
+if test "${enable_langinfo+set}" = set; then + enableval="$enable_langinfo" + + if test "$enable_langinfo" = yes; then + ac_safe=`echo "langinfo.h" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for langinfo.h""... $ac_c" 1>&6 +echo "configure:1314: checking for langinfo.h" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1324: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + + echo $ac_n "checking For nl_langinfo(CODESET)""... $ac_c" 1>&6 +echo "configure:1342: checking For nl_langinfo(CODESET)" >&5 +cat > conftest.$ac_ext < +int main() { +nl_langinfo(CODESET); +; return 0; } +EOF +if { (eval echo configure:1351: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + + cat >> confdefs.h <<\EOF +#define HAVE_LANGINFO 1 +EOF + + enable_langinfo=yes + +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + enable_langinfo=no +fi +rm -f conftest* +echo "$ac_t""$enable_langinfo" 1>&6 + +else + echo "$ac_t""no" 1>&6 +enable_langinfo=no +fi + + + fi +fi + + + +echo $ac_n "checking for working const""... 
$ac_c" 1>&6 +echo "configure:1381: checking for working const" >&5 +if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext <j = 5; +} +{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ + const int foo = 10; +} + +; return 0; } +EOF +if { (eval echo configure:1435: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + ac_cv_c_const=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_c_const=no +fi +rm -f conftest* +fi + +echo "$ac_t""$ac_cv_c_const" 1>&6 +if test $ac_cv_c_const = no; then + cat >> confdefs.h <<\EOF +#define const +EOF + +fi + + +echo $ac_n "checking whether setvbuf arguments are reversed""... $ac_c" 1>&6 +echo "configure:1457: checking whether setvbuf arguments are reversed" >&5 +if eval "test \"`echo '$''{'ac_cv_func_setvbuf_reversed'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + { echo "configure: error: can not run test program while cross compiling" 1>&2; exit 1; } +else + cat > conftest.$ac_ext < +/* If setvbuf has the reversed format, exit 0. */ +main () { + /* This call has the arguments reversed. + A reversed system may check and see that the address of main + is not _IOLBF, _IONBF, or _IOFBF, and return nonzero. */ + if (setvbuf(stdout, _IOLBF, (char *) main, BUFSIZ) != 0) + exit(1); + putc('\r', stdout); + exit(0); /* Non-reversed systems segv here. 
*/ +} +EOF +if { (eval echo configure:1479: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_func_setvbuf_reversed=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_func_setvbuf_reversed=no +fi +rm -fr conftest* +fi + +rm -f core core.* *.core +fi + +echo "$ac_t""$ac_cv_func_setvbuf_reversed" 1>&6 +if test $ac_cv_func_setvbuf_reversed = yes; then + cat >> confdefs.h <<\EOF +#define SETVBUF_REVERSED 1 +EOF + +fi + + +if test "$GCC" = "yes"; then + CFLAGS="-g -O2 -Wall" +fi + +cat >> confdefs.h <> confdefs.h <> confdefs.h <> confdefs.h <> confdefs.h <&6 +echo "configure:1530: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1558: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +fi +done + +for ac_func in strftime +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1585: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1613: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +LIBOBJS="$LIBOBJS ${ac_func}.${ac_objext}" +fi +done + + + + + + + + + + + + + + + + +trap '' 1 2 15 +cat > confcache <<\EOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs. It is not useful on other systems. +# If it contains results you don't want to keep, you may remove or edit it. +# +# By default, configure uses ./config.cache as the cache file, +# creating it if it does not exist already. You can give configure +# the --cache-file=FILE option to use a different cache file; that is +# what configure does when it calls configure scripts in +# subdirectories, so they share the cache. +# Giving --cache-file=/dev/null disables caching, for debugging configure. +# config.status only pays attention to the cache file if you give it the +# --recheck option to rerun configure. +# +EOF +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, don't put newlines in cache variables' values. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. 
+(set) 2>&1 | + case `(ac_space=' '; set | grep ac_space) 2>&1` in + *ac_space=\ *) + # `set' does not quote correctly, so add quotes (double-quote substitution + # turns \\\\ into \\, and sed turns \\ into \). + sed -n \ + -e "s/'/'\\\\''/g" \ + -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p" + ;; + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p' + ;; + esac >> confcache +if cmp -s $cache_file confcache; then + : +else + if test -w $cache_file; then + echo "updating cache $cache_file" + cat confcache > $cache_file + else + echo "not updating unwritable cache $cache_file" + fi +fi +rm -f confcache + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Any assignment to VPATH causes Sun make to only execute +# the first set of double-colon rules, so remove it if not needed. +# If there is a colon in the path, we need to keep it. +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d' +fi + +trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15 + +DEFS=-DHAVE_CONFIG_H + +# Without the "./", some shells look in PATH for config.status. +: ${CONFIG_STATUS=./config.status} + +echo creating $CONFIG_STATUS +rm -f $CONFIG_STATUS +cat > $CONFIG_STATUS </dev/null | sed 1q`: +# +# $0 $ac_configure_args +# +# Compiler output produced by configure, useful for debugging +# configure, is in ./config.log if it exists. 
+ +ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]" +for ac_option +do + case "\$ac_option" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion" + exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;; + -version | --version | --versio | --versi | --vers | --ver | --ve | --v) + echo "$CONFIG_STATUS generated by autoconf version 2.13" + exit 0 ;; + -help | --help | --hel | --he | --h) + echo "\$ac_cs_usage"; exit 0 ;; + *) echo "\$ac_cs_usage"; exit 1 ;; + esac +done + +ac_given_srcdir=$srcdir +ac_given_INSTALL="$INSTALL" + +trap 'rm -fr `echo "doc/Makefile charsets/Makefile src/Makefile Makefile doc/catdoc.1 doc/xls2csv.1 doc/wordview.1 doc/catppt.1 src/config.h" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15 +EOF +cat >> $CONFIG_STATUS < conftest.subs <<\\CEOF +$ac_vpsub +$extrasub +s%@SHELL@%$SHELL%g +s%@CFLAGS@%$CFLAGS%g +s%@CPPFLAGS@%$CPPFLAGS%g +s%@CXXFLAGS@%$CXXFLAGS%g +s%@FFLAGS@%$FFLAGS%g +s%@DEFS@%$DEFS%g +s%@LDFLAGS@%$LDFLAGS%g +s%@LIBS@%$LIBS%g +s%@exec_prefix@%$exec_prefix%g +s%@prefix@%$prefix%g +s%@program_transform_name@%$program_transform_name%g +s%@bindir@%$bindir%g +s%@sbindir@%$sbindir%g +s%@libexecdir@%$libexecdir%g +s%@datadir@%$datadir%g +s%@sysconfdir@%$sysconfdir%g +s%@sharedstatedir@%$sharedstatedir%g +s%@localstatedir@%$localstatedir%g +s%@libdir@%$libdir%g +s%@includedir@%$includedir%g +s%@oldincludedir@%$oldincludedir%g +s%@infodir@%$infodir%g +s%@mandir@%$mandir%g +s%@CC@%$CC%g +s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%g +s%@INSTALL_SCRIPT@%$INSTALL_SCRIPT%g +s%@INSTALL_DATA@%$INSTALL_DATA%g +s%@WISH@%$WISH%g +s%@CPP@%$CPP%g +s%@LIBOBJS@%$LIBOBJS%g +s%@specsuffix@%$specsuffix%g +s%@replsuffix@%$replsuffix%g +s%@buildtargets@%$buildtargets%g +s%@installtargets@%$installtargets%g +s%@targetcharset@%$targetcharset%g +s%@sourcecharset@%$sourcecharset%g +s%@man1dir@%$man1dir%g 
+s%@manext@%$manext%g +s%@charsetcheck@%$charsetcheck%g +s%@installroot@%$installroot%g +s%@catdoc_version@%$catdoc_version%g +s%@WORDS_BIGENDIAN DEFS@%$WORDS_BIGENDIAN DEFS%g + +CEOF +EOF + +cat >> $CONFIG_STATUS <<\EOF + +# Split the substitutions into bite-sized pieces for seds with +# small command number limits, like on Digital OSF/1 and HP-UX. +ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script. +ac_file=1 # Number of current file. +ac_beg=1 # First line for current file. +ac_end=$ac_max_sed_cmds # Line after last line for current file. +ac_more_lines=: +ac_sed_cmds="" +while $ac_more_lines; do + if test $ac_beg -gt 1; then + sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file + else + sed "${ac_end}q" conftest.subs > conftest.s$ac_file + fi + if test ! -s conftest.s$ac_file; then + ac_more_lines=false + rm -f conftest.s$ac_file + else + if test -z "$ac_sed_cmds"; then + ac_sed_cmds="sed -f conftest.s$ac_file" + else + ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file" + fi + ac_file=`expr $ac_file + 1` + ac_beg=$ac_end + ac_end=`expr $ac_end + $ac_max_sed_cmds` + fi +done +if test -z "$ac_sed_cmds"; then + ac_sed_cmds=cat +fi +EOF + +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF +for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories. + + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`" + # A "../" for each directory in $ac_dir_suffix. 
+ ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'` + else + ac_dir_suffix= ac_dots= + fi + + case "$ac_given_srcdir" in + .) srcdir=. + if test -z "$ac_dots"; then top_srcdir=. + else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;; + /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;; + *) # Relative path. + srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix" + top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + case "$ac_given_INSTALL" in + [/$]*) INSTALL="$ac_given_INSTALL" ;; + *) INSTALL="$ac_dots$ac_given_INSTALL" ;; + esac + + echo creating "$ac_file" + rm -f "$ac_file" + configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure." + case "$ac_file" in + *Makefile*) ac_comsub="1i\\ +# $configure_input" ;; + *) ac_comsub= ;; + esac + + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + sed -e "$ac_comsub +s%@configure_input@%$configure_input%g +s%@srcdir@%$srcdir%g +s%@top_srcdir@%$top_srcdir%g +s%@INSTALL@%$INSTALL%g +" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file +fi; done +rm -f conftest.s* + +# These sed commands are passed to sed as "A NAME B NAME C VALUE D", where +# NAME is the cpp macro being defined and VALUE is the value it is being given. +# +# ac_d sets the value in "#define NAME VALUE" lines. +ac_dA='s%^\([ ]*\)#\([ ]*define[ ][ ]*\)' +ac_dB='\([ ][ ]*\)[^ ]*%\1#\2' +ac_dC='\3' +ac_dD='%g' +# ac_u turns "#undef NAME" with trailing blanks into "#define NAME VALUE". +ac_uA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)' +ac_uB='\([ ]\)%\1#\2define\3' +ac_uC=' ' +ac_uD='\4%g' +# ac_e turns "#undef NAME" without trailing blanks into "#define NAME VALUE". +ac_eA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)' +ac_eB='$%\1#\2define\3' +ac_eC=' ' +ac_eD='%g' + +if test "${CONFIG_HEADERS+set}" != set; then +EOF +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF +fi +for ac_file in .. 
$CONFIG_HEADERS; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + echo creating $ac_file + + rm -f conftest.frag conftest.in conftest.out + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + cat $ac_file_inputs > conftest.in + +EOF + +# Transform confdefs.h into a sed script conftest.vals that substitutes +# the proper values into config.h.in to produce config.h. And first: +# Protect against being on the right side of a sed subst in config.status. +# Protect against being in an unquoted here document in config.status. +rm -f conftest.vals +cat > conftest.hdr <<\EOF +s/[\\&%]/\\&/g +s%[\\$`]%\\&%g +s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD}%gp +s%ac_d%ac_u%gp +s%ac_u%ac_e%gp +EOF +sed -n -f conftest.hdr confdefs.h > conftest.vals +rm -f conftest.hdr + +# This sed command replaces #undef with comments. This is necessary, for +# example, in the case of _POSIX_SOURCE, which is predefined and required +# on some systems where configure will not decide to define it. +cat >> conftest.vals <<\EOF +s%^[ ]*#[ ]*undef[ ][ ]*[a-zA-Z_][a-zA-Z_0-9]*%/* & */% +EOF + +# Break up conftest.vals because some shells have a limit on +# the size of here documents, and old seds have small limits too. + +rm -f conftest.tail +while : +do + ac_lines=`grep -c . conftest.vals` + # grep -c gives empty output for an empty file on some AIX systems. + if test -z "$ac_lines" || test "$ac_lines" -eq 0; then break; fi + # Write a limited-size here document to conftest.frag. 
+ echo ' cat > conftest.frag <> $CONFIG_STATUS + sed ${ac_max_here_lines}q conftest.vals >> $CONFIG_STATUS + echo 'CEOF + sed -f conftest.frag conftest.in > conftest.out + rm -f conftest.in + mv conftest.out conftest.in +' >> $CONFIG_STATUS + sed 1,${ac_max_here_lines}d conftest.vals > conftest.tail + rm -f conftest.vals + mv conftest.tail conftest.vals +done +rm -f conftest.vals + +cat >> $CONFIG_STATUS <<\EOF + rm -f conftest.frag conftest.h + echo "/* $ac_file. Generated automatically by configure. */" > conftest.h + cat conftest.in >> conftest.h + rm -f conftest.in + if cmp -s $ac_file conftest.h 2>/dev/null; then + echo "$ac_file is unchanged" + rm -f conftest.h + else + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + fi + rm -f $ac_file + mv conftest.h $ac_file + fi +fi; done + +EOF +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF + +exit 0 +EOF +chmod +x $CONFIG_STATUS +rm -fr confdefs* $ac_clean_files +test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1 + diff --git a/configure.in b/configure.in new file mode 100644 index 0000000..d1b2d4c --- /dev/null +++ b/configure.in @@ -0,0 +1,128 @@ +dnl Process this file with autoconf to produce a configure script. +AC_INIT(acconfig.h) +catdoc_version=0.94.1 +dnl Checks for programs. 
+AC_PROG_CC +case ${CC} in +*djgpp*) ac_cv_c_bigendian=no + ac_cv_func_setvbuf_reversed=no +;; +*) +;; +esac +AC_C_BIGENDIAN +AC_PROG_INSTALL +if test "$enable_wordview" != "no" ;then +AC_ARG_WITH(wish,[ --with-wish=fullpath allows to specify full path for Tk interpreter to + avoid checking, which requires X], +WISH=$withval,[ +AC_PATH_PROGS(WISH,wish wish8.1 wish8.2 wish8.3 wish8.4) +if test -n "$WISH"; then +AC_CACHE_CHECK(checking if wish version is 8.1 or above ,GOOD_WISH, +changequote(<<,>>)dnl +<>)dnl>> +<=7.6} { + puts -nonewline "yes" +} else { + puts -nonewline "no" +} +exit'>tmp$$.tcl +GOOD_WISH=`$WISH tmp$$.tcl` +rm tmp$$.tcl >> +<> +changequote([, ])dnl +) +if test "$GOOD_WISH" != yes; then +enable_wordview=no +fi +fi +]) +fi +AC_ARG_WITH(install-root,[ --with-install-root=path + allows to install catdoc into other directory + than compilied-in path points to],installroot=$withval,) +replsuffix=.replchars +specsuffix=.specchars +targetcharset=koi8-r +sourcecharset=cp1251 +AC_ARG_WITH(input,[ --with-input=charset set default charset to expect in 8-bit word files],sourcecharset=$withval,sourcecharset=cp1251) +AC_ARG_WITH(output,[ --with-output=charset set default charset to output],targetcharset=$withval,targetcharset=koi8-r) +if test "$targetcharset" = "utf-8"; then +charsetcheck="$sourcecharset.txt" +else +charsetcheck="$sourcecharset.txt $targetcharset.txt" +fi +AC_ARG_ENABLE(charset-check,[ --disable-charset-check allow make in charsets directory to succeed + even if files for default charsets are not found],[if test "$enable_charset_check" = no; +then + charsetcheck="" +fi]) +test -z "$manext" && manext=.1 +test -z "$man1dir" && man1dir=\${prefix}/man/man1 +if test -n "$WISH"; then +AC_ARG_ENABLE(wordview,[ --disable-wordview Do not install tcl-tk + wrapper],[:],) +fi +if test "$enable_wordview" = no; then + installtargets=install-catdoc + buildtargets="catdoc xls2csv catppt" +else + installtargets="install-catdoc install-wordview" + 
buildtargets="catdoc xls2csv catppt wordview" +fi +dnl Checks for libraries. + +dnl Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS(unistd.h) + +if test -z "$enable_langinfo"; then +enable_langinfo=yes +fi +AC_ARG_ENABLE(langinfo,[ --disable-langinfo Do not use system +locale for output encoding],[ + if test "$enable_langinfo" = yes; then + AC_CHECK_HEADER(langinfo.h,[ + AC_MSG_CHECKING([For nl_langinfo(CODESET)]) +AC_TRY_COMPILE([#include ],[nl_langinfo(CODESET);],[ + AC_DEFINE(HAVE_LANGINFO,1,[Define this if you have XPG4 comliant nl_langinfo, which accepts CODESET argument]) + enable_langinfo=yes + ],[enable_langinfo=no]) +AC_MSG_RESULT([$enable_langinfo]) + ],[enable_langinfo=no]) + + fi],[]) + + +dnl Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST + +dnl Checks for library functions. +AC_FUNC_SETVBUF_REVERSED + +if test "$GCC" = "yes"; then + CFLAGS="-g -O2 -Wall" +fi + +AC_DEFINE_UNQUOTED([SOURCE_CHARSET],"$sourcecharset",[Character encoding used by default for 8-bit source files]) +AC_DEFINE_UNQUOTED([TARGET_CHARSET],"$targetcharset",[Output character encoding used by default, if impossible to determine encoding from locale]) +AC_DEFINE_UNQUOTED([SPEC_EXT],"$specsuffix",[Suffix for files with special symbols map (ones to be replaced regardless of availability in target encoding)]) +AC_DEFINE_UNQUOTED([REPL_EXT],"$replsuffix",[Suffix for symbols replacement map (what to do with symbols, which are not available in the target encoding)]) +AC_DEFINE_UNQUOTED([UNKNOWN_CHAR],"?",[Symbol to represent character which is not available either in target encoding or in replacement map]) +AC_CHECK_FUNCS(strdup strtol) +AC_REPLACE_FUNCS(strftime) +AC_SUBST(specsuffix) +AC_SUBST(replsuffix) +AC_SUBST(buildtargets) +AC_SUBST(installtargets) +AC_SUBST(targetcharset) +AC_SUBST(sourcecharset) +AC_SUBST(man1dir) +AC_SUBST(manext) +AC_SUBST(charsetcheck) +AC_SUBST(installroot) +AC_SUBST(catdoc_version) +AC_SUBST(CFLAGS) 
+AC_SUBST(WORDS_BIGENDIAN DEFS) +AC_CONFIG_HEADER(src/config.h) +AC_OUTPUT(doc/Makefile charsets/Makefile src/Makefile Makefile doc/catdoc.1 doc/xls2csv.1 doc/wordview.1 doc/catppt.1) diff --git a/doc/.cvsignore b/doc/.cvsignore new file mode 100644 index 0000000..29a1b98 --- /dev/null +++ b/doc/.cvsignore @@ -0,0 +1,8 @@ +Makefile +catdoc.1 +catdoc.txt +catdoc.ps +wordview.1 +xls2csv.1 +xls2csv.ps +xls2csv.txt diff --git a/doc/Makefile.in b/doc/Makefile.in new file mode 100644 index 0000000..a74c77d --- /dev/null +++ b/doc/Makefile.in @@ -0,0 +1,38 @@ + +# Your C compiler and flags +SHELL = /bin/sh +installroot=@installroot@ +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +INSTALL = @INSTALL@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ +bindir = @bindir@ +libdir = @libdir@ +mandir = @man1dir@ +manext = @manext@ +%.txt: %.1 + nroff -mandoc $< |col -bx >$@ +%.ps: %.1 + groff -Tps -mandoc $< >$@ + +all: + touch build + +dosdoc: catdoc.txt catdoc.ps xls2csv.txt xls2csv.ps catppt.txt catppt.txt +clean: + rm build +install: @installtargets@ +install-catdoc: catdoc.1 xls2csv.1 catppt.1 + ../mkinstalldirs $(installroot)$(mandir) + $(INSTALL) -m 644 catdoc.1 $(installroot)$(mandir)/catdoc$(manext) + $(INSTALL) -m 644 xls2csv.1 $(installroot)$(mandir)/xls2csv$(manext) + $(INSTALL) -m 644 catppt.1 $(installroot)$(mandir)/catppt$(manext) +install-wordview: + ../mkinstalldirs $(installroot)$(mandir) + $(INSTALL) -m 644 wordview.1 $(installroot)$(mandir)/wordview$(manext) +# Following rules are primarily for developers use.
I doubt that you +# need to rebuild these versions of documentation +distclean: catppt.txt catppt.ps catdoc.1 catdoc.txt catdoc.ps xls2csv.1 xls2csv.txt xls2csv.ps + rm Makefile diff --git a/doc/catdoc.1.in b/doc/catdoc.1.in new file mode 100644 index 0000000..3072022 --- /dev/null +++ b/doc/catdoc.1.in @@ -0,0 +1,314 @@ +.TH catdoc 1 "Version @catdoc_version@" "MS-Word reader" +.SH NAME +catdoc \- reads MS-Word file and puts its content as plain text on standard output +.SH SYNOPSIS + +.BR catdoc " [" -vlu8btawxV "] [" -m " +.IR number ] +[ +.B -s +.IR charset ] +[ +.B -d +.IR charset ] +[ +.B -f +.IR output-format ] +.I file + +.SH DESCRIPTION + +.B catdoc +behaves much like +.BR cat (1) +but it reads MS-Word file and produces human-readable text on standard output. +Optionally it can use +.BR latex (1) +escape sequences for characters which have special meaning for LaTeX. +It also makes some effort to recognize MS-Word tables, although it never +tries to write correct headers for LaTeX tabular environment. Additional +output formats, such is HTML can be easily defined. +.PP +.B catdoc +doesn't attempt to extract formatting information other than tables from +MS-Word document, so different output modes means mainly that different +characters should be escaped and different ways used to represent characters, +missing from output charset. See CHARACTER SUBSTITUTION below + +.PP +.B catdoc +uses internal +.BR unicode (4) +representation of text, so it is able to convert texts when charset in +source document doesn't match charset on target system. +See CHARACTER SETS below. +.PP +If no file names supplied, +.B catdoc +processes its standard input unless it is terminal. It is unlikely that +somebody could type Word document from keyboard, so if +.B catdoc +invoked without arguments and stdin is not redirected, it prints brief +usage message and exits. +Processing of standard input (even among other files) can be forced using +dash '-' as file name. 
+.PP +By default, +.B catdoc +wraps lines which are more than 72 chars long and separates paragraphs by +blank lines. This behavior can be turned off by +.B -w +switch. In +.I wide +mode +.BR catdoc " prints each paragraph as one long line, suitable for import into" +word processors which perform word wrapping themselves. + + +.SH OPTIONS +.TP 8 +.B -a +- shortcut for -f ascii. Produces ASCII text as output. +Separates table columns with TAB +.TP 8 +.B -b +- process broken MS-Word file. Normally, +.BR catdoc " checks if first 8 bytes" +of file is Microsoft OLE signature. If so, it processes file, otherwise +it just copies it to stdout. It is intended to use +.B catdoc +as filter for viewing all files with +.I .doc +extension. +.TP 8 +.BI -d charset +- specifies destination charset name. Charset file has format described in +CHARACTER SETS below and should have +.B .txt +extension and reside in +.BR catdoc " library directory ( @libdir@/catdoc). By default, current" +locale charset is used if langinfo support compiled in. +.TP 8 +.BI -f format +- specifies output format as described in CHARACTER SUBSTITUTION below. +.B catdoc +comes with two output formats - ascii and tex. You can add your own if you +wish. +.TP 8 +.B -l +Causes +.B catdoc +to list names of available charsets to the stdout and exit successfully. +.TP 8 +.BI -m number +Specifies right margin for text (default 72). +.B -m 0 +is equivalent to +.B -w +.TP 8 +.BI -s charset +Specifies source charset (one used in Word document), if Word document +doesn't contain UTF-16 text. When reading rtf documents, it is +typically not necessary, because rtf documents contain ansicpg +specification. But it can be set wrong by Word (I've seen RTF documents +in Russian, where cp1252 was specified). In this case this option would +take precedence over charset, specified in the document. But +source_charset statement in the configuration file has less priority +than charset in the document.
+.TP 8 +.B -t +- shortcut for +.BR "-f tex" . +Converts all printable chars, which have special meaning for +.BR LaTeX (1) +into appropriate control sequences. Separates table columns by +.BR &. +.TP 8 +.B -u +- declares that Word document contains UNICODE (UTF-16) representation +of text (as some Word-97 documents). If catdoc fails to correctly process Word document +with default charset, try this option. +.TP 8 +.B -8 +- declares that Word document is 8 bit. Just in case that catdoc +recognizes file format incorrectly. +.TP 8 +.B -w +disables word wrapping. By default +.B catdoc +output is split into lines not longer than 72 (or number, specified by +-m option) characters and paragraphs +are separated by blank line. With this option each paragraph is one +long line. +.TP 8 +.B -x +causes catdoc to output unknown UNICODE character as \\xNNNN, instead +of question marks. +.TP 8 +.B -v +causes catdoc to print some useless information about word document +structure to stdout before actual start of text. +.TP 8 +.B -V +outputs catdoc version + +.SH CHARACTER SETS +When processing MS-Word file +.B catdoc +uses information about two character sets, typically different +- input and output. They are stored in plain text files in +.B catdoc +library directory. Character set files should contain two whitespace-separated +hexadecimal numbers - 8-bit code in character set and 16-bit Unicode code. +Anything from hash mark to end of line is ignored, as well as blank lines. + +.B catdoc +distribution includes some of these character sets. Additional character set +definitions, directly usable by +.B catdoc +can be obtained from ftp.unicode.org. Charset files have +.B .txt +suffix, which shouldn't be specified in command-line or configuration +files. +.PP +Note that +.B catdoc +is distributed with Cyrillic charsets as default. If you are not +Russian, you probably don't want it, and should reconfigure catdoc at +compile time or in runtime configuration file.
+.PP +When dealing with documents with charsets other than default, remember +that Microsoft never uses ISO charsets. While letters in, say cp1252 are +at the same position as in ISO-8859-1, some punctuation signs would be +lost, if you specify ISO-8859-1 as input charset. If you use cp1252, +catdoc would deal with those signs as described in CHARACTER +SUBSTITUTION below. + +.SH CHARACTER SUBSTITUTION +.B catdoc +converts MS-Word file into following internal Unicode representation: +.TP 4 +1. Paragraphs are separated by ASCII Line Feed symbol (0x000A) +.TP 4 +2. Table cells within row are separated by ASCII Field Separator symbol +(0x001C) +.TP 4 +3. Table rows are separated by ASCII Record Separator (0x001E) +.TP 4 +4. All printable characters, including whitespace are represented with their +respective UNICODE codes. +.PP +This UNICODE representation is subsequently converted into 8-bit text in +target character set using following four-step algorithm: +.TP 4 +1. List of special characters is searched for given Unicode character. +If found, then appropriate multi-character sequence is output instead of +character. +.TP 4 +2. If there is an equivalent in target character set, it is output. +.TP 4 +3. Otherwise, replacement list is searched and, if there is multi-character +substitution for this UNICODE char, it is output. +.TP 4 +4. If all above fails, "Unknown char" symbol (question mark) is output. +.PP +Lists of special characters and list of substitution are character +set-independent, because special chars should be escaped regardless of their +existence in target character set (usually, they are parts of US-ASCII, and +therefore exist in any character set) and replacement list is searched only +for those characters, which are not found in target character set. +.PP +These lists are stored in +.B catdoc +library directory in files with prefix of format name. 
These files have +following format: +.PP +Each line can be either comment (starting with hash mark) or contain +hexadecimal UNICODE value, separated by whitespace from string, which +would be substituted instead of it. If string contains no whitespace it +can be used as is, otherwise it should be enclosed in single or double +quotes. Usual backslash sequences like +.IR '\en' , '\et' +can be used in these strings. + + +.SH RUNTIME CONFIGURATION +Upon startup catdoc reads its system-wide configuration file ( +.BR catdocrc " in" +.B catdoc +library directory) and then +user-specific configuration file +.BR ${HOME}/.catdocrc. +.PP +These files can contain following directives: +.TP 8 +.BI "source_charset = " charset-name +Sets default source charset, which would be used if no +.B -s +option specified. Consult configuration of nearby windows +workstation to find one you need. +.TP 8 +.BI "target_charset = " charset-name +Sets default output charset. You probably know, which one you use. +.TP 8 +.BI "charset_path = " directory-list +colon-separated list of directories, which are searched for charset files. +This allows you to install additional charsets in your home directory. +If first directory component of path is ~ it is replaced by contents of +.B HOME +environment variable. +On MS-DOS platform, if directory name starts with %s, it is replaced +with directory of executable file. Empty element in list (i.e. two +consecutive colons) is considered current directory. +.TP 8 +.BI "map_path = " directory-list +colon-separated list of directories, which are searched for special character +map and replacement map. +Same substitution rules as in +.B charset_path +are applied. +.TP 8 +.BI "format = " "format name" +Output format which would be used by default. +.B catdoc +comes with two formats - +.BR ascii " and " tex +but nothing prevents you from writing your own format (set two map files - +special character map and replacement map).
+.TP 8 +.BI "unknown_char = " "character specification" +sets character to output instead of unknown Unicode character (default '?'). +Character specification can have one of two forms - character enclosed in +single quotes or hexadecimal code. +.TP 8 +.BI "use_locale =" "(yes|no)" +Enables or disables automatic selection of output charset (default +.BR yes ), +based on +system locale settings (if enabled at compile time). If automatic +detection is enabled, then output charset settings in the configuration +files (but not in the command line) are ignored, and current system +locale charset is used instead. There is no automatic choice of input +charset, based on locale language, because most modern Word files (since +Word 97) are Unicode anyway. + +.SH BUGS + +Doesn't handle +fast-saves properly. Prints footnotes as separate paragraphs at the end of +file, instead of producing correct LaTeX commands. Cannot distinguish +between empty table cell and end of table row. + + + +.SH "SEE ALSO" + +.BR xls2csv (1), +.BR cat (1), +.BR strings (1), +.BR utf (4), +.BR unicode (4) + +.SH AUTHOR + +V.B.Wagner diff --git a/doc/catppt.1.in b/doc/catppt.1.in new file mode 100644 index 0000000..883dc9b --- /dev/null +++ b/doc/catppt.1.in @@ -0,0 +1,58 @@ +.TH catppt 1 "Version @catdoc_version@" "MS-PowerPoint reader" +.SH NAME +catppt \- reads MS-PowerPoint file and puts its content on standard output +.SH SYNOPSIS + +.BR "catppt " [ -lV ] +.RB [ -b +.IR " string " ] +.RB [ -s +.IR " charset " ] +.RB [ -d +.IR " charset " ] +.I files + +.SH DESCRIPTION + +.B catppt +reads MS-PowerPoint presentations and dumps its content to stdout. +.SH "OPTIONS" +.TP 8 +.BR -l +list known charsets and exit successfully +.TP 8 +.BI -b string +slides break string. This string (by default - formfeed) would be output +at the end of each slide page. + +.TP 8 +.BI -d charset +- specifies destination charset name.
Charset file has format described in +CHARACTER SETS section of +.BR catdoc (1) +manual page. By default, current locale +charset would be used if langinfo support was enabled at the compile time. + +.TP 8 +.BI -s charset +- specifies source charset. Typically, PowerPoint files use UNICODE +strings with known charsets, but for some reason you may wish to +override it. + +.TP 8 +.B -V +outputs version number + +.SH "SEE ALSO" + +.BR cat (1), +.BR catdoc (1), +.BR xls2csv (1), +.BR strings (1), +.BR utf (4), +.BR unicode (4) + +.SH AUTHOR + +Alex Ott + diff --git a/doc/wordview.1.in b/doc/wordview.1.in new file mode 100644 index 0000000..7ec33ee --- /dev/null +++ b/doc/wordview.1.in @@ -0,0 +1,92 @@ +.TH wordview 1x "Version @catdoc_version@" "MS-Word reader" +.SH NAME +wordview \- displays text contained in MS-Word file in X window + +.SH SYNOPSIS +.BR wordview " [" +.IR filename "]" + +.SH DESCRIPTION + +.B wordview +is simple GUI wrapper around +.BR catdoc (1) +which allows to browse through word file interactively. It doesn't allow +to edit file, but allows to save plain text representation (or version +with some TeX commands) into the file. +.PP +If for some reason +.B catdoc +doesn't recognize file encoding properly, +.B wordview +allows to specify encoding interactively. + +.SH OPTIONS +.B wordview +supports standard X options, supported by +.BR wish (1) + +.SH X RESOURCES +Following X resources can be used to customize +.BR wordview " look:" + +.TP 8 +.B Wordview.Text.Background +background color for main window +.TP 8 +.B Wordview.Text.Foreground +Foreground color for main window +.TP 8 +.B Wordview.Text.selectBackground +Background color of selected text +.TP 8 +.B Wordview.Text.selectForeground +Foreground color of selected text +.TP 8 +.B Wordview.Text.Font +Font to display text. We recommend to use fixed-width font, such as +Courier, because +.BR catdoc (1) +is intended to convert Word into text.
Either XLFD font names or +Tk-style font specifications like +.B {Courier 12pt} +can be used for +specifying font. If you use XLFD font names, usage of unicode +(iso10646-1) fonts is recommended. +.TP 8 +.B Wordview.Text.findMode +How to search text. This option can have value either +.BR exact " or " regexp +and specifies whether text is searched for exact match or for regular +expression by default. This behavior can be toggled interactively via +checkbox in the search dialog. +.TP 8 +.B Wordview.Text.findCase +This boolean option controls whether search is case-sensitive. +Default is no. +.TP 8 +.B Wordview.Menu.highlightBackground +Background color for highlighted menu item +.TP 8 +.B Wordview.Menu.highlightThickness +.TP 8 +.B Wordview.Menu.activeBackground +Background color of active menu item. +.TP 8 +.B Wordview.Menu.activeBorderWidth +Width (in pixels) of border around highlighted menu item. Default +is 0, which differs from Tk global default. See +.BR options (n) +for more details. +.PP +A lot of other resource options which affect behavior of standard Tk +widgets can affect wordview. See Tcl/Tk manual pages for more +information. + +.SH SEE ALSO +.BR catdoc "(1), " wish "(1), " options (n) + +.SH AUTHOR +Victor Wagner . + + diff --git a/doc/xls2csv.1.in b/doc/xls2csv.1.in new file mode 100644 index 0000000..d165eb7 --- /dev/null +++ b/doc/xls2csv.1.in @@ -0,0 +1,101 @@ +.TH xls2csv 1 "Version @catdoc_version@" "MS-Excel reader" +.SH NAME +xls2csv \- reads MS-Excel file and puts its content as comma-separated data on standard output +.SH SYNOPSIS + +.BR "xls2csv " [ -xlV ] +.RB [ -f +.IR " format " ] +.RB [ -b +.IR " string " ] +.RB [ -s +.IR " charset " ] +.RB [ -d +.IR " charset " ] +.RB [ -q +.IR " number " ] +.RB [ -c +.IR " char" ] +.I files + +.SH DESCRIPTION + +.B xls2csv +reads MS-Excel spreadsheet and dumps its content as comma-separated +values to stdout. Numbers are printed without delimiters, strings are +enclosed in the double quotes.
Double-quotes inside string are doubled. +.SH "OPTIONS" +.TP 8 +.BR -x +print unknown Unicode chars as \exNNNN, rather than as question marks +.TP 8 +.BR -l +list known charsets and exit successfully +.TP 8 +.BI -c char +cell separator char. By default - comma. +.TP 8 +.BI -b string +sheet break string. This string (by default - formfeed) would be output +at the end of each workbook page. This string is printed after page +starting at start of line, but no linefeed would be automatically added +at the end of string. Include newline at the end of sheet separator if +you want it to appear on separate line by itself. +.TP 8 +.BI -g number +number of decimal digits in the numbers. By default maximal double +precision (system-dependent macro DBL_DIG) is used. +.TP 8 +.BI -q number +set quote mode. In quote mode 0 cell contents is never quoted. +In quote mode 1 only strings which contain spaces, double quotes or +commas are quoted. +In quote mode 2 (default) all cells with type string are quoted. +In quote mode 3 all cells are quoted. + +.TP 8 +.BI -d charset +- specifies destination charset name. Charset file has format described in +CHARACTER SETS section of +.BR catdoc (1) +manual page. By default, current locale +charset would be used if langinfo support was enabled at the compile time. + +.TP 8 +.BI -s charset +- specifies source charset. Typically, Excel files have CODE PAGE +record, which denotes input charset, but for some reason you may wish to +override it. +.TP 8 +.BI -f format +- specifies date/time format to use for output of all Excel date and +time values. If this option is not specified, format, specified in +the spreadsheet is used. On POSIX system any format, allowed by +.BR strftime (3) +can be used as value of this option. Under MS-DOS +.B xls2csv +implements limited set of +.B strftime +formats, namely +.BR m ", " d ", " y ", " Y ", " b ", " l ", " p ", " H ", " M ", " S .
+ +.TP 8 +.B -V +outputs version number + +.SH FILES +${HOME}/.catdocrc, catdoc charset files and substitution map files (see +.BR catdoc (1) +manual page for details, + +.SH "SEE ALSO" + +.BR cat (1), +.BR catdoc (1), +.BR strings (1), +.BR utf (4), +.BR unicode (4) + +.SH AUTHOR + +V.B.Wagner , based on biffview by David Rysdam diff --git a/install-sh b/install-sh new file mode 100755 index 0000000..e843669 --- /dev/null +++ b/install-sh @@ -0,0 +1,250 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). +# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. 
+ +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. 
+ + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. 
+ + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/missing b/missing new file mode 100755 index 0000000..cbe2b0e --- /dev/null +++ b/missing @@ -0,0 +1,188 @@ +#! /bin/sh +# Common stub for a few missing GNU programs while installing. +# Copyright (C) 1996, 1997 Free Software Foundation, Inc. +# Franc,ois Pinard , 1996. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. 
+ +if test $# -eq 0; then + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 +fi + +case "$1" in + + -h|--h|--he|--hel|--help) + echo "\ +$0 [OPTION]... PROGRAM [ARGUMENT]... + +Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an +error status if there is no known handling for PROGRAM. + +Options: + -h, --help display this help and exit + -v, --version output version information and exit + +Supported PROGRAM values: + aclocal touch file \`aclocal.m4' + autoconf touch file \`configure' + autoheader touch file \`config.h.in' + automake touch all \`Makefile.in' files + bison create \`y.tab.[ch]', if possible, from existing .[ch] + flex create \`lex.yy.c', if possible, from existing .c + lex create \`lex.yy.c', if possible, from existing .c + makeinfo touch the output file + yacc create \`y.tab.[ch]', if possible, from existing .[ch]" + ;; + + -v|--v|--ve|--ver|--vers|--versi|--versio|--version) + echo "missing - GNU libit 0.0" + ;; + + -*) + echo 1>&2 "$0: Unknown \`$1' option" + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 + ;; + + aclocal) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acinclude.m4' or \`configure.in'. You might want + to install the \`Automake' and \`Perl' packages. Grab them from + any GNU archive site." + touch aclocal.m4 + ;; + + autoconf) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`configure.in'. You might want to install the + \`Autoconf' and \`GNU m4' packages. Grab them from any GNU + archive site." + touch configure + ;; + + autoheader) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acconfig.h' or \`configure.in'. You might want + to install the \`Autoconf' and \`GNU m4' packages. Grab them + from any GNU archive site." 
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER([^):]*:\([^)]*\)).*/\1/p' configure.in` + if test -z "$files"; then + files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^):]*\)).*/\1/p' configure.in` + test -z "$files" || files="$files.in" + else + files=`echo "$files" | sed -e 's/:/ /g'` + fi + test -z "$files" && files="config.h.in" + touch $files + ;; + + automake) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`Makefile.am', \`acinclude.m4' or \`configure.in'. + You might want to install the \`Automake' and \`Perl' packages. + Grab them from any GNU archive site." + find . -type f -name Makefile.am -print \ + | sed 's/^\(.*\).am$/touch \1.in/' \ + | sh + ;; + + bison|yacc) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.y' file. You may need the \`Bison' package + in order for those modifications to take effect. You can get + \`Bison' from any GNU archive site." + rm -f y.tab.c y.tab.h + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.y) + SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.c + fi + SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.h + fi + ;; + esac + fi + if [ ! -f y.tab.h ]; then + echo >y.tab.h + fi + if [ ! -f y.tab.c ]; then + echo 'main() { return 0; }' >y.tab.c + fi + ;; + + lex|flex) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.l' file. You may need the \`Flex' package + in order for those modifications to take effect. You can get + \`Flex' from any GNU archive site." + rm -f lex.yy.c + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.l) + SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" lex.yy.c + fi + ;; + esac + fi + if [ ! 
-f lex.yy.c ]; then + echo 'main() { return 0; }' >lex.yy.c + fi + ;; + + makeinfo) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.texi' or \`.texinfo' file, or any other file + indirectly affecting the aspect of the manual. The spurious + call might also be the consequence of using a buggy \`make' (AIX, + DU, IRIX). You might want to install the \`Texinfo' package or + the \`GNU make' package. Grab either from any GNU archive site." + file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'` + if test -z "$file"; then + file=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` + file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $file` + fi + touch $file + ;; + + *) + echo 1>&2 "\ +WARNING: \`$1' is needed, and you do not seem to have it handy on your + system. You might have modified some files without having the + proper tools for further handling them. Check the \`README' file, + it often tells you about the needed prerequirements for installing + this package. You may also peek at any GNU archive site, in case + some other package would contain this missing \`$1' program." + exit 1 + ;; +esac + +exit 0 diff --git a/mkinstalldirs b/mkinstalldirs new file mode 100755 index 0000000..5b084f5 --- /dev/null +++ b/mkinstalldirs @@ -0,0 +1,40 @@ +#! /bin/sh +# mkinstalldirs --- make directory hierarchy +# Author: Noah Friedman +# Created: 1993-05-16 +# Public domain + +# $Id: mkinstalldirs,v 1.1 2006-02-24 17:44:06 vitus Exp $ + +errstatus=0 + +for file +do + set fnord `echo ":$file" | sed -ne 's/^:\//#/;s/^://;s/\// /g;s/^#/\//;p'` + shift + + pathcomp= + for d + do + pathcomp="$pathcomp$d" + case "$pathcomp" in + -* ) pathcomp=./$pathcomp ;; + esac + + if test ! -d "$pathcomp"; then + echo "mkdir $pathcomp" 1>&2 + + mkdir "$pathcomp" || lasterr=$? + + if test ! 
-d "$pathcomp"; then + errstatus=$lasterr + fi + fi + + pathcomp="$pathcomp/" + done +done + +exit $errstatus + +# mkinstalldirs ends here diff --git a/src/.cvsignore b/src/.cvsignore new file mode 100644 index 0000000..b5a20ae --- /dev/null +++ b/src/.cvsignore @@ -0,0 +1,18 @@ +*-valgrind.log +Data +Makefile +PRICE_LAN_ALL.XLS +WordDocument +aaa +aaaa +bbbb +catdoc +config.h +core.* +msole-excel.sheet.8-tmp1362961854 +semantic.cache +test-ole +test-ole.c +test-oleparser.c +wordview +xls2csv diff --git a/src/Makefile.in b/src/Makefile.in new file mode 100644 index 0000000..ef4f417 --- /dev/null +++ b/src/Makefile.in @@ -0,0 +1,103 @@ +# Your C compilier and flags +SHELL = /bin/sh + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ +installroot = @installroot@ + +bindir = @bindir@ +libdir = @libdir@ +confdir = @sysconfdir@ +datadir = @datadir@ +mandir = @mandir@ +INSTALL = @INSTALL@ +WISH = @WISH@ +CC = @CC@ +COMPAT_OBJ=@LIBOBJS@ +build_targets = @buildtargets@ +install_targets = @installtargets@ +CHARSETPATH=${datadir}/catdoc +# Flags to pass to your C compilier +# remove -O2 on HP/UX 9. It is known to cause problems +FLAGS=-I. @CFLAGS@ @DEFS@ -DCATDOC_VERSION=\"@catdoc_version@\" -DCHARSETPATH=\"${CHARSETPATH}\" + +#Compile-time configuration of catdoc itself. Edit according to taste + +#Directory, where all system-wide stuff resides +DATA_DIR=$(datadir)/catdoc +#directory, where catdoc binary would be placed +#directory where to put man page + +#System-wide configuration file +SYSTEMRC=$(confdir)/catdocrc + +#Per-user configration file (in user's home directory +USERRC=.catdocrc + +# path where to search for charsets. 
may be colon-separated list of dirs + +# Extension for map of special character +SPEC_EXT=@specsuffix@ + +# Extension for map of substitutes (chars which are missing from target +# charset) +REPL_EXT=@replsuffix@ + +# Target charset - one used on your system console or xterm +TARGET_CHARSET=@targetcharset@ + +# Source charset - one which your Word files are usially in unless they +# are UNICODE +SOURCE_CHARSET=@sourcecharset@ + +# Character which would be printed instead of ones, not found in charset +# or replacement map +UNKNOWN_CHAR=? + +# END OF COMPILE-TIME CONFIGURATION + +DEFINES=-DSYSTEMRC=\"$(SYSTEMRC)\" -DUSERRC=\"$(USERRC)\" + +CFLAGS=$(FLAGS) $(DEFINES) @DEFS@ + +COMMONOBJ=charsets.o substmap.o fileutil.o confutil.o numutils.o ole.o + +OBJ=catdoc.o reader.o writer.o analyze.o rtfread.o $(COMMONOBJ) + +OBJXLS=xls2csv.o sheet.o xlsparse.o $(COMMONOBJ) + +OBJPPT=catppt.o pptparse.o $(COMMONOBJ) +#.c.o: +# $(CC) -c $(CFLAGS) $* +all: @buildtargets@ +catdoc: $(OBJ) + $(CC) -o catdoc $(OBJ) +xls2csv: $(OBJXLS) + $(CC) -o xls2csv $(OBJXLS) -lm + +catppt: $(OBJPPT) + $(CC) -o catppt $(OBJPPT) -lm + +install: @installtargets@ +install-catdoc:catdoc xls2csv catppt + ../mkinstalldirs $(installroot)$(bindir) + $(INSTALL) -m 755 catdoc $(installroot)$(bindir)/catdoc + $(INSTALL) -m 755 xls2csv $(installroot)$(bindir)/xls2csv + $(INSTALL) -m 755 catppt $(installroot)$(bindir)/catppt +install-wordview: wordview + ../mkinstalldirs $(installroot)$(bindir) + $(INSTALL) -m 755 wordview $(installroot)$(bindir)/wordview +wordview: wordview.tcl + echo "#! 
$(WISH)" >wordview + echo set charset_lib "\"$(CHARSETPATH)\"">>wordview + cat wordview.tcl >>wordview + chmod 0755 wordview +strftime.o: ../compat/strftime.c + $(CC) -c $(FLAGS) -o $@ $* +clean: + rm -f *.o catdoc wordview xls2csv catppt +distclean: clean + rm Makefile diff --git a/src/analyze.c b/src/analyze.c new file mode 100644 index 0000000..b721ce6 --- /dev/null +++ b/src/analyze.c @@ -0,0 +1,175 @@ +/* + Copyright 1998-2003 Victor Wagner + Copyright 2003 Alex Ott + This file is released under the GPL. Details can be + found in the file COPYING accompanying this distribution. +*/ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#include "catdoc.h" +char ole_sign[]={0xD0,0xCF,0x11,0xE0,0xA1,0xB1,0x1A,0xE1,0}; +char rtf_sign[]="{\\rtf"; +char old_word_sign[]={0xdb,0xa5,0}; +char write_sign[]={0x31,0xBE,0}; +int verbose=0; +/********************************************************************* + * Determines format of input file and calls parse_word_header or + * process_file if + * it is word processor file or copy_out if it is plain text file + * return not 0 when error + ********************************************************************/ +int analyze_format(FILE *f) { + unsigned char buffer[129]; + long offset=0; + FILE *new_file, *ole_file; + int ret_code=69; + + if (!signature_check) { + /* forced parsing */ + /* no autodetect possible. Assume 8-bit if not overriden on + * command line */ + if (!get_unicode_char) + get_unicode_char=get_8bit_char; + return process_file(f,LONG_MAX); + } + catdoc_read(buffer,4,1,f); + buffer[4]=0; + if (strncmp(buffer,write_sign,2)==0) { + printf("[Windows Write file. 
Some garbage expected]\n"); + get_unicode_char=get_8bit_char; + return process_file(f,LONG_MAX); + } else if (strncmp(buffer,rtf_sign,4)==0) { + return parse_rtf(f); + } else if (strncmp(buffer,old_word_sign,2)==0) { + fread(buffer+4,1,124,f); + return parse_word_header(buffer,f,128,0); + } + fread(buffer+4,1,4,f); + if (strncmp(buffer,ole_sign,8)==0) { + if ((new_file=ole_init(f, buffer, 8)) != NULL) { + set_ole_func(); + while((ole_file=ole_readdir(new_file)) != NULL) { + int res=ole_open(ole_file); + if (res >= 0) { + if (strcmp(((oleEntry*)ole_file)->name , "WordDocument") == 0) { + offset=catdoc_read(buffer, 1, 128, ole_file); + ret_code=parse_word_header(buffer,ole_file,-offset,offset); + } + } + ole_close(ole_file); + } + set_std_func(); + ole_finish(); + } else { + fprintf(stderr,"Broken OLE file. Try using -b switch"); + exit(1); + } + } else { + set_std_func(); + copy_out(f,buffer); + return 0; + } + + return ret_code; +} +#define fDot 0x0001 +#define fGlsy 0x0002 +#define fComplex 0x0004 +#define fPictures 0x0008 +#define fEncrypted 0x100 +#define fReadOnly 0x400 +#define fReserved 0x800 +#define fExtChar 0x1000 + +/*******************************************************************/ +/* parses word file info block passed in buffer. 
+ * Determines actual size of text stream and calls process_file + ********************************************************************/ +int parse_word_header(unsigned char * buffer,FILE *f,int offset,long curpos) { + int flags,charset, ret_code=0; + long textstart,textlen,i; + char buf[2]; + + if (verbose) { + printf("File Info block version %d\n",getshort(buffer,2)); + printf("Found at file offset %ld (hex %lx)\n",curpos,curpos); + printf("Written by product version %d\n",getshort(buffer,4)); + printf("Language %d\n",getshort(buffer,6)); + } + flags = getshort(buffer,10); + if (verbose) { + if ((flags & fDot)) { + printf("This is template (DOT) file\n"); + } else { + printf("This is document (DOC) file\n"); + } + if (flags & fGlsy) { + printf("This is glossary file\n"); + } + } + if (flags & fComplex) { + fprintf(stderr,"[This was fast-saved %2d times. Some information is lost]\n", + (flags & 0xF0)>>4); +/* ret_code=69;*/ + } + if (verbose) { + if (flags & fReadOnly) { + printf("File is meant to be read-only\n"); + } + if (flags & fReserved) { + printf("File is write-reserved\n"); + } + } + if (flags & fExtChar) { + if (verbose) { + printf ("File uses extended character set\n"); + } + if (!get_unicode_char) + get_unicode_char=get_word8_char; + + } else if (!get_unicode_char) + get_unicode_char=get_8bit_char; + + if (verbose) { + if (buffer[18]) { + printf("File created on Macintosh\n"); + } else { + printf("File created on Windows\n"); + } + } + if (flags & fEncrypted) { + fprintf(stderr,"[File is encrypted. 
Encryption key = %08lx]\n", + getlong(buffer,14)); + return 69; + } + if (verbose) { + charset=getshort(buffer,20); + if (charset&&charset !=256) { + printf("Using character set %d\n",charset); + } else { + printf("Using default character set\n"); + } + } + /* skipping to textstart and computing textend */ + textstart=getlong(buffer,24); + textlen=getlong(buffer,28)-textstart; + textstart+=offset; + if (verbose) { + printf ("Textstart = %ld (hex %lx)\n",textstart+curpos,textstart+curpos); + printf ("Textlen = %ld (hex %lx)\n",textlen,textlen); + } + for (i=0;i +#endif + +#include +#include +#include +#include +#include +#include "catdoc.h" + +void help(void); + + +int signature_check = 1; +int forced_charset = 0; /* Flag which disallow rtf parser override charset*/ +int wrap_margin = WRAP_MARGIN; +int (*get_unicode_char)(FILE *f,long *offset,long fileend) =NULL; + +char *input_buffer, *output_buffer; +#ifdef __WATCOMC__ +/* watcom doesn't provide way to access program args via global variable */ +/* so we would hack it ourselves in Borland-compatible way*/ +char **_argv; +int _argc; +#endif +/**************************************************************/ +/* Main program */ +/* Processes options, reads charsets files and substitution */ +/* maps and passes all remaining args to processfile */ +/**************************************************************/ +int main(int argc, char **argv) { + FILE *f; + int c,i; + char *tempname; + short int *tmp_charset; + int stdin_processed=0; +#ifdef __WATCOMC__ + _argv=argv; + _argc=argc; +#endif + read_config_file(SYSTEMRC); +#ifdef USERRC + tempname=find_file(strdup(USERRC),getenv("HOME")); + if (tempname) { + read_config_file(tempname); + free(tempname); + } +#endif +#ifdef HAVE_LANGINFO + get_locale_charset(); +#endif + while ((c=getopt(argc,argv,"Vls:d:f:taubxv8wm:"))!=-1) { + switch (c) { + case 's': + check_charset(&source_csname,optarg); + forced_charset = 1; + break; + case 'd': + check_charset(&dest_csname,optarg); 
+ break; + case 'f': + format_name=strdup(optarg); + break; + case 't': + format_name=strdup("tex"); + break; + case 'a': + format_name=strdup("ascii"); + break; + case 'u': + get_unicode_char = get_word8_char; + break; + case '8': + get_unicode_char = get_8bit_char; + break; + case 'v': + verbose=1; + break; + case 'w': + wrap_margin=0; /* No wrap */ + break; + case 'm': { + char *endptr; + wrap_margin = strtol(optarg,&endptr,0); + if (*endptr) { + fprintf(stderr,"Invalid wrap margin value `%s'\n",optarg); + exit(1); + } + break; + } + case 'l': list_charsets(); exit(0); + case 'b': signature_check =0; break; + case 'x': unknown_as_hex = 1; break; + case 'V': printf("Catdoc Version %s\n",CATDOC_VERSION); + exit(0); + default: + help(); + exit(1); + } + } + input_buffer=malloc(FILE_BUFFER); + if (!input_buffer) { + fprintf(stderr,"Input buffer not allocated\n"); + } + source_charset = read_charset(source_csname); + if (!source_charset) exit(1); + if (strncmp(dest_csname,"utf-8",6)) { + tmp_charset = read_charset(dest_csname); + if (!tmp_charset) exit(1); + target_charset= make_reverse_map(tmp_charset); + free(tmp_charset); + } else { + target_charset = NULL; + } + spec_chars=read_substmap(stradd(format_name,SPEC_EXT)); + if (!spec_chars) { + fprintf(stderr,"Cannot read substitution map %s%s\n",format_name, + SPEC_EXT); + exit(1); + } + replacements=read_substmap(stradd(format_name,REPL_EXT)); + if (!replacements) { + fprintf(stderr,"Cannot read replacement map %s%s\n",format_name, + REPL_EXT); + exit(1); + } + + if (LINE_BUF_SIZE-longest_sequence<=wrap_margin) { + fprintf(stderr,"wrap margin is too large. 
cannot proceed\n"); + exit(1); + } + if (!isatty(fileno(stdout))) { + output_buffer=malloc(FILE_BUFFER); + if (output_buffer) { + if (setvbuf(stdout,output_buffer,_IOFBF,FILE_BUFFER)) { + perror("stdout"); + } + } else { + fprintf(stderr,"output buffer not allocated\n"); + } + } + set_std_func(); + if (optind == argc) { + if (isatty(fileno(stdin))) { + help(); + exit(0); + } + if (input_buffer) setvbuf(stdin,input_buffer,_IOFBF,FILE_BUFFER); + return analyze_format(stdin); + } + c=0; + for (i=optind;i +#endif + +/* There is some strange thing on aix */ +#if (defined(_AIX)||defined(___AIX)) && !defined(__unix) +# define __unix 1 +#endif + +/* These include files are always available */ +#include +#include + +/* This is our own file */ +#include "ole.h" + +/* + * User customization + * + */ + + +#if defined(__MSDOS__) || defined(_WIN32) +/* MS-DOS doesn't like dot at first char and thinks that suffix + * should be separated by dot. So we'd call personal config catdoc.rc + */ +# define USERRC "catdoc.rc" +/* In DOS, %s in path gets replaced with full path to executable including + trailing backslash. + */ +# ifndef SYSTEMRC +# define SYSTEMRC "%s\\catdoc.rc" +# endif +# ifndef CHARSETPATH +# define CHARSETPATH "%s\\charsets" +# endif +/* Function to add executable directory in place of %s in path. + Not usable in Unix, where executable can have more then one + link and configuration files are usially kept separately from executables + */ +char *add_exe_path(const char* name); +/* Separator of directories in list, such as PATH env var. 
*/ +# define LIST_SEP ';' +/* Separator of levels inside path */ +# define DIR_SEP '\\' +#else +/* On POSIX systems personal configuration files should start with dot*/ +# ifndef USERRC +# define USERRC ".catdocrc" +# endif + +# ifndef SYSTEMRC +# define SYSTEMRC "/usr/local/lib/catdoc/catdocrc" +# endif + +# ifndef CHARSETPATH +# define CHARSETPATH "/usr/local/lib/catdoc" +# endif +/* Macro to add executable directory in place of %s in path. + Not usable in Unix, where executable can have more then one + link and configuration files are usially kept separately from executables + */ +# define add_exe_path(name) name +/* Separator of directories in list, such as PATH env var. */ +# define LIST_SEP ':' +/* Separator of levels inside path */ +#define DIR_SEP '/' +#endif + +/* Charset files distributed with catdoc always have .txt extension*/ +#ifndef CHARSET_EXT +# define CHARSET_EXT ".txt" +#endif + +/* Default charsets */ +#ifndef TARGET_CHARSET +#if defined(__MSDOS__) || defined(_WIN32) +#define TARGET_CHARSET "cp866" +#else +#define TARGET_CHARSET "koi8-r" +#endif +#endif + +#ifndef SOURCE_CHARSET +#define SOURCE_CHARSET "cp1251" +#endif + +#ifndef UNKNOWN_CHAR +#define UNKNOWN_CHAR "?" +#endif +/* On MS-DOS and WIN32 files have to have 3-char extension */ +#if defined(__MSDOS__) || defined(_WIN32) +# ifndef SPEC_EXT +# define SPEC_EXT ".spc" +# endif +# ifndef REPL_EXT +# define REPL_EXT ".rpl" +# endif +#else + +/* On other system we'll rename them to something more readable */ +# ifndef SPEC_EXT +# define SPEC_EXT ".specchars" +# endif +# ifndef REPL_EXT +# define REPL_EXT ".replchars" +# endif +#endif +#if defined(__MSDOS__) && !defined(__DJGPP__) +/* Buffer sizes for 16-bit DOS progran */ +#define PARAGRAPH_BUFFER 16384 +#define FILE_BUFFER 32256 +#define PATH_BUF_SIZE 80 +#else +/* Buffers for 32-bit and more program */ +#define PARAGRAPH_BUFFER 262144 +#define FILE_BUFFER 262144 +#define PATH_BUF_SIZE 1024 +#endif + +/* Buffer for single line. 
Should be greater than wrap margin + + longest substitution sequence */ +#define LINE_BUF_SIZE 512 +/* Default value for wrap margin */ +#ifndef WRAP_MARGIN +#define WRAP_MARGIN 72 +#endif +/* variable (defined in catdoc.c) which holds actual value of wrap margin*/ +extern int wrap_margin; +/* + * Public types variables and procedures which should be avalable + * to all files in the program + */ + +#ifdef __TURBOC__ +/* Turbo C defines broken isspace, which works only for us-ascii */ +#undef isspace +#define isspace(c) ((unsigned char)(c) <=32) +#endif + +/* Structure to store UNICODE -> target charset mappings */ +/* array of 256 pointers (which may be null) to arrays of 256 short ints + which contain 8-bit character codes or -1 if no matching char */ +typedef short int ** CHARSET; + +/* structure to store multicharacter substitution mapping */ +/* Array of 256 pointers to arrays of 256 pointers to string */ +/* configuration variables defined in catdoc.c */ +typedef char *** SUBSTMAP; + +extern short int *source_charset; +extern char bad_char[]; /* defines one-symbol string to replace unknown unicode chars */ +extern char *source_csname; +extern char *dest_csname; +extern char *format_name; +extern CHARSET target_charset; +extern SUBSTMAP spec_chars; + /* Defines unicode chars which should be + replaced by strings before UNICODE->target chatset + mappigs are applied i.e. TeX special chars like % + */ +extern SUBSTMAP replacements; + /* Defines unicode chars which could be + mapped to some character sequence if no + corresponding character exists in the target charset + i.e copyright sign */ +extern int verbose; /* if true, some additional information would be + printed. 
defined in analyze.c */ +extern int (*get_unicode_char)(FILE *f,long *offset,long fileend); +/* pointer to function which gets + a char from stream */ + +extern int get_utf16lsb (FILE *f,long *offset,long fileend); +extern int get_utf16msb (FILE *f,long *offset,long fileend); +extern int get_utf8 (FILE *f,long *offset,long fileend); +extern int get_8bit_char (FILE *f,long *offset,long fileend); + +extern int get_word8_char (FILE *f,long *offset,long fileend); + +extern const char *charset_from_codepage(unsigned int codepage); +extern short int *read_charset(const char *filename); +extern CHARSET make_reverse_map (short int *charset); + +extern int to_unicode (short int *charset, int c) ; + +extern int from_unicode (CHARSET charset, int u) ; + +extern char* convert_char(int unicode_char); + +extern char* to_utf8(unsigned int uc); + +extern char* map_path, *charset_path; +extern int signature_check; +extern int unknown_as_hex; +char *find_file(char *name, const char *path); +char *stradd(const char *s1, const char *s2); +void read_config_file(const char *filename); +#ifdef HAVE_LANGINFO +void get_locale_charset(void); +#if defined(HAVE_STRFTIME) && !defined(__TURB0C__) +void set_time_locale(); +#endif +#endif +SUBSTMAP read_substmap(char* filename); +extern int longest_sequence;/* for checking which value of wrap_margin + can cause buffer overflow*/ +char *map_subst(SUBSTMAP map,int uc); + +int check_charset(char **filename,const char *charset); +int process_file(FILE *f,long stop); +void copy_out(FILE *f, char *header); +void output_paragraph(unsigned short int *buffer) ; +int parse_rtf(FILE *f); +/* format recognition*/ +int analyze_format(FILE *f); +void list_charsets(void); +int parse_word_header(unsigned char *buffer,FILE *f,int offset,long curpos); +/* large buffers for file IO*/ +extern char *input_buffer,*output_buffer; +#ifndef HAVE_STRDUP + char *strdup(const char *s); +#endif +/* numeric conversions */ +long int getlong(unsigned char *buffer,int offset); 
+unsigned long int getulong(unsigned char *buffer,int offset); +unsigned int getshort(unsigned char *buffer,int offset); +#endif diff --git a/src/catdoc.rsp b/src/catdoc.rsp new file mode 100644 index 0000000..8716cc6 --- /dev/null +++ b/src/catdoc.rsp @@ -0,0 +1,3 @@ ++-charsets.obj +-substmap.obj +-reader.obj +-writer.obj +-fileutil.obj & ++-langinfo.obj +-analyze.obj +-confutil.obj +-rtfread.obj +-numutils.obj & ++-sheet.obj +-xlsparse.obj +-ole.obj +-strftime.obj +-pptparse.obj diff --git a/src/catppt.c b/src/catppt.c new file mode 100644 index 0000000..31c8120 --- /dev/null +++ b/src/catppt.c @@ -0,0 +1,157 @@ +/** + * @file ppt2text.c + * @author Alex Ott + * @date 23 äÅË 2004 + * Version: $Id: catppt.c,v 1.1 2006-02-24 17:44:06 vitus Exp $ + * Copyright: Alex Ott + * + * @brief main module for text extracting from .ppt + * + * + */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#include +#include +#include "ppt.h" +#include "catdoc.h" +#include +#include "catdoc.h" +#include "float.h" + +#ifdef __TURBOC__ +#define strcasecmp(a,b) strcmpi(a,b) +#endif + +/** + * Displays help message + * + */ +void help (void) { + printf("Usage:\n ppt2text [-lV] [-b string] [-s charset] [-d charset] files\n"); +} + + +char *input_buffer, *output_buffer; + +/** + * + * + * @param argc + * @param argv + * + * @return + */ +int main(int argc, char *argv[]) { + FILE *input; + FILE *new_file, *ole_file; + char *filename =NULL; + short int *tmp_charset; + int c; + int i; + char *tempname; + read_config_file(SYSTEMRC); +#ifdef USERRC + tempname=find_file(strdup(USERRC),getenv("HOME")); + if (tempname) { + read_config_file(tempname); + free(tempname); + } +#endif +#ifdef HAVE_LANGINFO + get_locale_charset(); +#endif + + check_charset(&dest_csname,dest_csname); + + while ((c=getopt(argc,argv,"Vls:d:p:"))!=-1) { + switch(c) { + case 'l': + list_charsets(); exit(0); + case 's': + check_charset(&source_csname,optarg); + 
source_charset=read_charset(source_csname); + break; + case 'd': + check_charset(&dest_csname,optarg); + break; + case 'V': printf("Catdoc Version %s\n",CATDOC_VERSION); + exit(0); + default: + help(); + exit(1); + } + } + /* If we are using system strftime, we need to set LC_TIME locale + * category unless choosen charset is not same as system locale + */ +#if defined(HAVE_LANGINFO) && defined(HAVE_STRFTIME) && !defined(__TURB0C__) + set_time_locale(); +#endif + /* charset conversion init*/ + input_buffer=malloc(FILE_BUFFER); + if (strcmp(dest_csname,"utf-8")) { + tmp_charset=read_charset(dest_csname); + if (!tmp_charset) { + fprintf(stderr,"Cannot load target charset %s\n",dest_csname); + exit(1); + } + target_charset=make_reverse_map(tmp_charset); + free(tmp_charset); + } else { + target_charset=NULL; + } + spec_chars=read_substmap(stradd("ascii",SPEC_EXT)); + if (!spec_chars) { + fprintf(stderr,"Cannod read substitution map ascii%s\n", + SPEC_EXT); + exit(1); + } + replacements=read_substmap(stradd("ascii",REPL_EXT)); + if (!replacements) { + fprintf(stderr,"Cannod read substitution map ascii%s\n", + REPL_EXT); + exit(1); + } + if (optind>=argc) { + if (isatty(fileno(stdin))) { + help(); + exit(0); + } + do_ppt(stdin,"STDIN"); + exit (0); + } + for (i=optind;iname); */ + if (res >= 0) { + if (strcasecmp(((oleEntry*)ole_file)->name , "PowerPoint Document") == 0) { + do_ppt(ole_file,filename); + } + } + ole_close(ole_file); + } + set_std_func(); + ole_finish(); + fclose(new_file); + } else { + fprintf(stderr, "%s is not OLE file or Error\n", filename); + } + } + return 0; +} diff --git a/src/charsets.c b/src/charsets.c new file mode 100644 index 0000000..b52a1eb --- /dev/null +++ b/src/charsets.c @@ -0,0 +1,302 @@ +/* + Copyright 1998-2003 Victor Wagner + Copyright 2003 Alex Ott + This file is released under the GPL. Details can be + found in the file COPYING accompanying this distribution. 
+*/ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include "catdoc.h" + +char *charset_path=CHARSETPATH; +char *source_csname=SOURCE_CHARSET, *dest_csname=TARGET_CHARSET; +short int * source_charset; +int unknown_as_hex=0; +char bad_char[]=UNKNOWN_CHAR; +CHARSET target_charset; +/************************************************************************/ +/* Converts char in input charset into unicode representation */ +/* Should be converted to macro */ +/************************************************************************/ +int to_unicode (short int *charset, int c) { + return charset[c]; +} +/************************************************************************/ +/* Search inverse charset record for given unicode char and returns */ +/* 0-255 char value if found, -1 otherwise */ +/************************************************************************/ +int from_unicode (CHARSET charset, int u) { + short int *p; + /* This is really assignment, not comparation */ + if ((p=charset[(unsigned)u>>8])) { + return p[u & 0xff]; + } else { + return -1; + } +} +/************************************************************************/ +/* Converts direct (charset -> unicode) to reverse map */ +/************************************************************************/ +CHARSET make_reverse_map(short int *charset) { + CHARSET newmap=calloc(sizeof(short int *), 256); + int i,j,k,l; + short int *p; + if (! 
charset) { + return NULL; + } + for (i=0;i<256;i++) { + k= charset[i]; + j= (unsigned)k>>8; + if (!newmap[j]) { + newmap[j] = malloc(sizeof(short int *)*256); + if (!newmap[j]) { + fprintf(stderr,"Insufficient memory for charset\n"); + exit(1); + } + for (l=0,p=newmap[j];l<256;l++,p++) *p=-1; + } + p=newmap[j]; + p[k & 0xff]=i; + } + return newmap; +} + +/************************************************************************/ +/* Reads charset file (as got from ftp.unicode.org) and returns array of*/ +/* 256 short ints (malloced) mapping from charset t unicode */ +/************************************************************************/ +short int * read_charset(const char *filename) { + char *path; + FILE *f; + short int *new=calloc(sizeof(short int),256); + int c; + long int uc; + path= find_file(stradd(filename,CHARSET_EXT),charset_path); + if (!path) { + fprintf(stderr,"Cannot load charset %s - file not found\n",filename); + return NULL; + } + f=fopen(path,"rb"); + + if (!f) { + perror(path); + return NULL; + } + if (input_buffer) + setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER); + /* defaults */ + for (c=0;c<32;c++) { + new[c]=c; + } + while (!feof(f)) { + if (fscanf(f,"%i %li",&c,&uc)==2) { + if (c<0||c>255||uc<0||(uc>0xFEFE&& uc!=0xFFFE)) { + fprintf(stderr,"Invalid charset file %s\n",path); + fclose(f); + return NULL; + } + new[c]=uc; + } + while((fgetc(f)!='\n')&&!feof(f)) ; + } + fclose (f); + free(path); + return new; +} + + +/************************************************************************/ +/* Reads 8-bit char and convers it from source charset */ +/************************************************************************/ + +int get_8bit_char (FILE *f,long *offset,long fileend) +{ + unsigned char buf; + if (catdoc_read(&buf, 1, 1, f)==0) return EOF; + (*offset)++; + return to_unicode(source_charset,buf); +} + + +/************************************************************************/ +/* Reads 16-bit unicode value. 
MS-Word runs on LSB-first machine only, */ +/* so read lsb first always and don't care about proper bit order */ +/************************************************************************/ + +int get_utf16lsb (FILE *f,long *offset,long fileend) { + unsigned char buf[2]; + int result; + result=catdoc_read(buf, 1, 2, f); + if (result<0) { + perror("read:"); + exit(1); + } + if (result !=2) { + return EOF; + } + (*offset)+=2; + return ((int)buf[1])|(((int)buf[0])<<8); +} + +/************************************************************************/ +/* Reads 16-bit unicode value written in MSB order. For processing + * non-word files . */ +/************************************************************************/ +int get_utf16msb (FILE *f,long *offset,long fileend) { + unsigned char buf[2]; + int result; + result=catdoc_read(buf, 1, 2, f); + if (result<0) { + perror("read:"); + exit(1); + } + if (result !=2) { + return EOF; + } + (*offset)+=2; + return ((int)buf[0])|(((int)buf[1])<<8); +} + +int get_utf8 (FILE *f,long *offset,long fileend) { + unsigned char buf[3]; + int d,c; + int result; + result=catdoc_read(buf, 1, 1, f); + if (result<0) { + perror("read"); + exit(1); + } + if (result==0) return EOF; + c=buf[0]; + d=0; + if (c<0x80) + return c; + if (c <0xC0) + return 0xfeff; /*skip corrupted sequebces*/ + if (c <0xE0) { + if (catdoc_read(buf+1, 1, 1, f)<=0) return EOF; + return ((c & 0x1F)<<6 | ((char)buf[1] & 0x3F)); + } + if (c <0xF0) { + if (catdoc_read(buf+1, 1, 2, f)<=2) return (int)EOF; + return ((c & 0x0F)<<12)| + ((buf[1] & 0x3f)<<6)| + (buf[2] & 0x3f); + } + return 0xFEFF; +} + +/**************************************************************************/ +/* Converts unicode char to output charset sequence. Coversion have */ +/* three steps: 1. Replacement map is searched for the character in case */ +/* it is not allowed for output format (% in TeX, < in HTML */ +/* 2. target charset is searched for this unicode char, if it wasn't */ +/* replaced. 
If not found, then 3. Substitution map is searched */ +/**************************************************************************/ +char *convert_char(int uc) { + static char plain_char[]="a"; /*placeholder for one-char sequences */ + static char hexbuf[8]; + char *mapped; + int c; + if ((mapped=map_subst(spec_chars,uc))) return mapped; + if (target_charset) { + c =from_unicode(target_charset,uc); + if (c>=0) { + *plain_char=c; + return plain_char; + } + if ((mapped = map_subst(replacements,uc))) return mapped; + if (unknown_as_hex) { + sprintf(hexbuf,"\\x%04X",(unsigned)uc); + /* This sprintf is safe, becouse uc is unicode character code, + which cannot be greater than 0xFFFE. It is ensured by routines + in reader.c + */ + return hexbuf; + } + return bad_char; + } else { + /* NULL target charset means UTF-8 output */ + return to_utf8(uc); + } +} +/******************************************************************/ +/* Converts given unicode character to the utf-8 sequence */ +/* in the static string buffer. 
Buffer wouldbe overwritten upon */ +/* next call */ +/******************************************************************/ +char *to_utf8(unsigned int uc) { + static char utfbuffer[4]; /* it shouldn't overflow becouse we never deal + with chars greater than 65535*/ + int count=0; + if (uc< 0x80) { + utfbuffer[0]=uc; + count=1; + } else { + if (uc < 0x800) { + utfbuffer[count++]=0xC0 | (uc >> 6); + } else { + utfbuffer[count++]=0xE0 | (uc >>12); + utfbuffer[count++]=0x80 | ((uc >>6) &0x3F); + } + utfbuffer[count++]=0x80 | (uc & 0x3F); + } + utfbuffer[count]=0; + return utfbuffer; +} + +struct cp_map { + int codepage; + char *charset_name; +}; + +struct cp_map cp_to_charset [] = { + {10000,"mac-roman"}, + {10001,"mac-japanese"}, + {10002,"mac-tchinese"}, + {10003,"mac-korean"}, + {10004,"mac-arabic"}, + {10005,"mac-hebrew"}, + {10006,"mac-greek1"}, + {10007,"mac-cyrillic"}, + {10008,"mac-schinese"}, + {10010,"mac-romania"}, + {10017,"mac-ukraine"}, + {10021,"mac-thai"}, + {10029,"mac-centeuro"}, + {10079,"mac-iselandic"}, + {10081,"mac-turkish"}, + {10082,"mac-croatia"}, + {20866,"koi8-r"}, + {28591,"8859-1"}, + {28592,"8859-2"}, + {28593,"8859-3"}, + {28594,"8859-4"}, + {28595,"8859-5"}, + {28596,"8859-6"}, + {28597,"8859-7"}, + {28598,"8859-8"}, + {28599,"8859-9"}, + {28605,"8859-15"}, + {65001,"utf-8"}, + {0,NULL}}; +const char *charset_from_codepage(unsigned int codepage) { + + static char buffer[7]; + struct cp_map *cp; + if (codepage==1200||codepage==1201) { + /* For UCS2 */ + return ""; + } else + if (codepage<10000) { + sprintf(buffer,"cp%d",codepage); + return buffer; + } else { + for (cp = cp_to_charset;cp->codepage!=0&& cp->codepage!=codepage;cp++); + return cp->charset_name; + } +} diff --git a/src/config.h.in b/src/config.h.in new file mode 100644 index 0000000..4175b63 --- /dev/null +++ b/src/config.h.in @@ -0,0 +1,47 @@ +/* src/config.h.in. Generated automatically from configure.in by autoheader 2.13. 
*/ + +/* Define to empty if the keyword does not work. */ +#undef const + +/* Define if the setvbuf function takes the buffering type as its second + argument and the buffer pointer as the third, as on System V + before release 3. */ +#undef SETVBUF_REVERSED + +/* Define if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define if your processor stores words with the most significant + byte first (like Motorola and SPARC, unlike Intel and VAX). */ +#undef WORDS_BIGENDIAN + +/* Define if you have the strdup function. */ +#undef HAVE_STRDUP + +/* Define if you have the strftime function. */ +#undef HAVE_STRFTIME + +/* Define if you have the strtol function. */ +#undef HAVE_STRTOL + +/* Define if you have the header file. */ +#undef HAVE_UNISTD_H + +/* Define this if you have XPG4 comliant nl_langinfo, which accepts CODESET argument */ +#undef HAVE_LANGINFO + +/* Character encoding used by default for 8-bit source files */ +#undef SOURCE_CHARSET + +/* Output character encoding used by default, if impossible to determine encoding from locale */ +#undef TARGET_CHARSET + +/* Suffix for files with special symbols map (ones to be replaced regardless of availability in target encoding) */ +#undef SPEC_EXT + +/* Suffix for symbols replacement map (what to do with symbols, which are not available in the target encoding) */ +#undef REPL_EXT + +/* Symbol to represent character which is not available either in target encoding or in replacement map */ +#undef UNKNOWN_CHAR + diff --git a/src/confutil.c b/src/confutil.c new file mode 100644 index 0000000..1bfe199 --- /dev/null +++ b/src/confutil.c @@ -0,0 +1,171 @@ +/* + Copyright 1998-2003 Victor Wagner + Copyright 2003 Alex Ott + This file is released under the GPL. Details can be + found in the file COPYING accompanying this distribution. 
+*/ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#ifdef HAVE_LANGINFO +#include +#ifndef __TURBOC__ +#include +#endif +#endif +#include "catdoc.h" + +char *format_name="ascii"; +static int runtime_locale_check=1; +/********************************************************************/ +/* Reads configuration file */ +/* */ +/********************************************************************/ +void read_config_file(const char* filename) { + FILE *f=fopen(add_exe_path(filename),"rb"); + char *name,*value,line[1024],*c; + int lineno=0; + if (!f) return; + while (!feof(f)) { + fgets(line,1024,f); + if (feof(f)) break; + lineno++; + if ((c=strchr(line,'#'))) *c='\0'; + name=line; + while (*name&&isspace(*name)) name++; + if (!*name) continue; + for (value=name;*value&&(isalnum(*value)||*value=='_'); value++); + if (*value=='=') { + *value=0;value++; + } else { + *value=0;value++; + while(*value&&isspace(*value)) value++; + if (*value++ != '=' ) { + fprintf(stderr,"Error %s(%d): name = value syntax expected\n", + filename,lineno); + exit(1); + } + while(*value&&isspace(*value)) value++; + } + for (c=value;*c&&!isspace(*c);c++); + if (value==c) { + fprintf(stderr,"Error %s(%d): name = value syntax expected\n", + filename,lineno); + exit(1); + } + *c=0; + if (!strcmp(name,"source_charset")) { + source_csname=strdup(value); + } else if (!strcmp(name,"target_charset")) { + dest_csname=strdup(value); + } else if (!strcmp(name,"format")) { + format_name=strdup(value); + } else if (!strcmp(name,"charset_path")) { + charset_path=strdup(value); + } else if (!strcmp(name,"map_path")) { + map_path = strdup(value); + } else if (!strcmp(name,"unknown_char")) { + if (*value=='"' && value[1] && value[2]=='"') value++; + if (*value=='\'' && value[1] && value[2]=='\'') value++; + bad_char[0] = *value; + } else if (!strcmp(name,"use_locale")) { + if (tolower(value[0])=='n') { + runtime_locale_check=0; + } else if (tolower(value[0])=='y') { + 
runtime_locale_check=1; + } else { + fprintf(stderr,"Error %s(%d): use_locale requires 'yes' or 'no'\n", + filename,lineno); + exit(1); + } + + } else { + fprintf(stderr,"Invalid configuration directive in %s(%d):,%s = %s\n", + filename,lineno,name,value); + exit(1); + } + } + fclose(f); +} +#ifdef HAVE_LANGINFO +static char *locale_charset = NULL; +/*********************************************************************/ +/* Determines output character set from current locale and puts it * + * into global variable dest_csname * + *********************************************************************/ +void get_locale_charset() { + char *codeset; + if (!runtime_locale_check) return; +#ifndef __TURBOC__ + if (!setlocale(LC_CTYPE,"")) return; +#endif + codeset = nl_langinfo(CODESET); + if (!strncmp(codeset,"ISO",3)||!strncmp(codeset,"iso",3)) { + codeset+=3; + if (*codeset=='-') codeset++; + if (!strncmp(codeset,"646",3)) { + /* ISO 646 is another name for us=ascii */ + check_charset(&dest_csname,"us-ascii") ; + } else { + if (check_charset(&dest_csname,codeset)) { + locale_charset = dest_csname; + } + } + } else if (!strcmp(codeset,"ANSI_X3.4-1968")) { + check_charset(&dest_csname,"us-ascii"); + } else if (!strncmp(codeset,"ANSI",4)||!strncmp(codeset,"ansi",4)) { + char *newstr; + if (*codeset=='-') { + codeset++; + } + newstr = malloc(strlen(codeset)-4+2+1); + strcpy(newstr,"cp"); + strcpy(newstr+2,codeset+4); + if (check_charset(&dest_csname,newstr)) { + locale_charset = dest_csname; + } + free(newstr); + } else if (!strncmp(codeset,"IBM",3)) { + char *newstr; + codeset+=3; + if (*codeset == '-') codeset++; + newstr=malloc(strlen(codeset)+2+1); + strcpy(newstr,"cp"); + strcpy(newstr+2,codeset); + if (check_charset(&dest_csname, newstr)) { + locale_charset=dest_csname; + } + free(newstr); + } else { + char *i,*newstr = strdup(codeset); + for (i=newstr;*i;i++) { + *i=tolower(*i); + } + if (check_charset(&dest_csname,newstr)) { + locale_charset = dest_csname; + } + } + 
#ifndef HAVE_STRDUP
/*
 * Implementation of strdup for systems which don't have it.
 *
 * Returns a malloc()ed copy of s which the caller must free(), or NULL
 * when the allocation fails. The previous version passed an unchecked
 * malloc() result straight to strcpy().
 */
char *strdup(const char *s) {
	size_t size=strlen(s)+1; /* include the terminating NUL */
	char *newstr=malloc(size);
	if (!newstr) return NULL;
	return memcpy(newstr,s,size);
}
#endif
__MSDOS__ + } else { + strcpy(path_buf,add_exe_path(path_buf)); /* safe, becouse + add_exe_path knows about PATH_BUF_SIZE */ +#endif + } + return 1; + +} +/************************************************************************/ +/* Searches for file name in specified list of directories. Sets */ +/* Returns dynamically allocated full path or NULL. if nothing */ +/* appropriate Expects name to be dynamically allocated and frees it */ +/************************************************************************/ +char *find_file(char *name, const char *path) +{ const char *p; + char *q; + char path_buf[PATH_BUF_SIZE]; + char dir_sep[2]={DIR_SEP,0}; + for (p=path;p;p=q+1) { + q=strchr(p,LIST_SEP); + + if (q) { + if (!prepare_path_buf(path_buf,p,q)) continue; + } else { + q--; + if (!prepare_path_buf(path_buf,p,p+strlen(p))) continue; + } + strcat(path_buf,dir_sep); /* always one char */ + if (strlen(path_buf)+strlen(name)>=PATH_BUF_SIZE) + continue; /* Ignore too deeply nested directories */ + strcat(path_buf,name); + if (access(path_buf,0)==0) { + free(name); + return strdup(path_buf); + } + } + /* if we are here, nothing found */ + free(name); + return NULL; +} + +/************************************************************************/ +/* Searches for charset with given name and put pointer to malloced copy*/ +/* of its name into first arg if found. Otherwise leaves first arg */ +/* unchanged. 
/**********************************************************************/
/*          Returns malloced string containing concatenation of two   */
/*          arguments. The caller owns the result and must free() it. */
/*          Never returns NULL: when memory cannot be obtained the    */
/*          program prints a message and exits with status 1.         */
/**********************************************************************/
char *stradd(const char *s1,const char *s2)
{
	size_t len1=strlen(s1);
	size_t len2=strlen(s2);
	char *res=NULL;
	/* Allocate only when len1+len2+1 does not wrap around size_t */
	if (len2 < (size_t)-1 - len1) {
		res=malloc(len1+len2+1);
	}
	if (!res) {
		fprintf (stderr,"Out of memory!\n");
		exit(1);
	}
	memcpy(res,s1,len1);
	memcpy(res+len1,s2,len2+1); /* copies the terminating NUL too */
	return res;
}
names without extension * + * of all .txt files in the charset path + internally-supported utf-8 * + ************************************************************************/ + +void list_charsets(void) { + const char *p; + char *q; + char path_buf[PATH_BUF_SIZE]; + char dir_sep[2]={DIR_SEP,0}; +#ifdef __MSDOS__ + struct ffblk ffblock; + int res,col; +#else + glob_t glob_buf; + int count,glob_flags=GLOB_ERR; +#endif + char **ptr; + for (p=charset_path;p;p=q+1) { + q=strchr(p,LIST_SEP); + + if (q) { + if (q-p>=PATH_BUF_SIZE) { + /* Oops, dir name too long, perhabs broken config file */ + continue; + } + strncpy(path_buf,p,q-p); + path_buf[q-p]=0; + } else { + q--; + if (strlen(p)>=PATH_BUF_SIZE) continue; + strcpy(path_buf,p); + } + /* Empty list element means current directory */ + if (!*path_buf) { + path_buf[0]='.'; + path_buf[1]=0; +#ifdef __MSDOS__ + } else { + strcpy(path_buf,add_exe_path(path_buf)); /* safe, becouse + add_exe_path knows about PATH_BUF_SIZE */ +#endif + } + strcat(path_buf,dir_sep); /* always one char */ + if (strlen(path_buf)+6>=PATH_BUF_SIZE) + continue; /* Ignore too deeply nested directories */ + strcat(path_buf,"*.txt"); +#ifdef __MSDOS__ + res=findfirst(path_buf,&ffblock,FA_RDONLY | FA_HIDDEN | FA_ARCH); + col=1; + printf("Available charsets:\n"); + while (!res) { + char name[12],*src,*dest; + dest=name; + src=ffblock.ff_name; + for (dest=name,src=ffblock.ff_name;*src && *src !='.';dest++,src++) + *dest=tolower(*src); + *dest++=(col<5)?'\t':'\n'; + if (++col>5) col=1; + *dest=0; + printf("%10s",name); + res=findnext(&ffblock); + } +#else + switch (glob(path_buf,glob_flags,NULL,&glob_buf)) { + case 0: +#ifdef GLOB_NOMATCH + case GLOB_NOMATCH: +#endif + break; + default: + perror("catdoc"); + exit(1); + } + glob_flags|=GLOB_APPEND; +#endif + } +#ifdef __MSDOS__ + fputs("utf-8\n",stdout); +#else + count=0;printf("Available charsets:"); + for (ptr=glob_buf.gl_pathv;*ptr;ptr++) { + printf("%c",(count++)%5?'\t':'\n'); + 
p=strrchr(*ptr,dir_sep[0]); + if (!p) continue; + p++; + if ((q=strchr(p,'.'))) *q=0; + fputs(p,stdout); + } + printf("%c",(count++)%5?'\t':'\n'); + fputs("utf-8",stdout); + printf("\n"); + globfree(&glob_buf); +#endif +} diff --git a/src/makefile.tc b/src/makefile.tc new file mode 100644 index 0000000..53f8df1 --- /dev/null +++ b/src/makefile.tc @@ -0,0 +1,25 @@ +CC=tcc +TCDIR=c:\tc +LIB=$(TCDIR)\lib +CFLAGS=-v -w -mc -DHAVE_LANGINFO -DHAVE_STRDUP -DCATDOC_VERSION="0.94.1" +OBJ=charsets.obj substmap.obj reader.obj writer.obj fileutil.obj langinfo.obj analyze.obj confutil.obj rtfread.obj numutils.obj sheet.obj xlsparse.obj ole.obj strftime.obj pptparse.obj +.c.obj: + $(CC) -v -c -mc -I../compat $(CFLAGS) $* +all: catdoc.exe xls2csv.exe catppt.exe +catdoc.exe: catdoc.obj catdoc.lib + $(CC) -v -mc catdoc.obj catdoc.lib +xls2csv.exe: xls2csv.obj catdoc.lib + $(CC) -v -mc xls2csv.obj catdoc.lib +catppt.exe: catppt.obj catdoc.lib + $(CC) -v -mc catppt.obj catdoc.lib +langinfo.obj: ..\compat\langinfo.c + $(CC) -c -I../compat $(CFLAGS) ../compat/langinfo.c +strftime.obj: ..\compat\strftime.c + $(CC) -c -I../compat $(CFLAGS) ../compat/strftime.c + +catdoc.lib: $(OBJ) catdoc.rsp + tlib /E catdoc @catdoc.rsp +clean: + del *.obj + del *.exe + del *.lib diff --git a/src/numutils.c b/src/numutils.c new file mode 100644 index 0000000..9376f35 --- /dev/null +++ b/src/numutils.c @@ -0,0 +1,29 @@ +/*****************************************************************/ +/* Utilities to convert various numeric types from the Windows */ +/* (Little endian) format to native types */ +/* */ +/* This file is part of catdoc project */ +/* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003 */ +/*****************************************************************/ + + +/********************************************************************/ +/* Reads 2-byte LSB int from buffer at given offset platfom-indepent + * way + *********************************************************************/ +unsigned int 
/********************************************************************/
/* Reads 4-byte LSB signed int from buffer at given offset in a
 * platform-independent way.
 *
 * The four bytes are assembled as an unsigned 32-bit quantity and then
 * sign-extended by hand, so the result is the same whether long is 32
 * or 64 bits wide (0xFFFFFFFE always reads as -2). Shifting a byte
 * >= 0x80 into bit 31 of a signed 32-bit long was undefined behaviour,
 * and on 64-bit long the value used to come back positive.
 *********************************************************************/
long int getlong(unsigned char *buffer,int offset) {
	unsigned long raw=(unsigned long)buffer[offset]
		|((unsigned long)buffer[offset+1]<<8)
		|((unsigned long)buffer[offset+2]<<16)
		|((unsigned long)buffer[offset+3]<<24);
	if (raw & 0x80000000UL) {
		/* two's complement negative: -(~raw + 1), computed without
		 * ever overflowing a signed long */
		return -(long)(~raw & 0x7FFFFFFFUL) - 1;
	}
	return (long)raw;
}

/********************************************************************/
/* Reads 4-byte LSB unsigned int from buffer at given offset in a
 * platform-independent way.
 *********************************************************************/
unsigned long int getulong(unsigned char *buffer,int offset) {
	return (unsigned long)buffer[offset]
		|((unsigned long)buffer[offset+1]<<8)
		|((unsigned long)buffer[offset+2]<<16)
		|((unsigned long)buffer[offset+3]<<24);
}
Might be pipe or socket + * @param buffer (void *) bytes already read from f + * @param bufSize number of bytes already read from f should be less + * than 512 + * + * @return + */ +FILE* ole_init(FILE *f, void *buffer, size_t bufSize) { + unsigned char oleBuf[BBD_BLOCK_SIZE]; + unsigned char *tmpBuf; + FILE *newfile; + int ret=0, i; + long int sbdMaxLen, sbdCurrent, propMaxLen, propCurrent, mblock, msat_size; + oleEntry *tEntry; + + /* deleting old data (if it was allocated) */ + ole_finish(); + + if (fseek(f,0,SEEK_SET) == -1) { + if ( errno == ESPIPE ) { + /* We got non-seekable file, create temp file */ + if((newfile=tmpfile()) == NULL) { + perror("Can't create tmp file"); + return NULL; + } + if (bufSize > 0) { + ret=fwrite(buffer, 1, bufSize, newfile); + if(ret != bufSize) { + perror("Can't write to tmp file"); + return NULL; + } + } + + while(!feof(f)){ + ret=fread(oleBuf,1,BBD_BLOCK_SIZE,f); + fwrite(oleBuf, 1, ret, newfile); + } + fseek(newfile,0,SEEK_SET); + } else { + perror("Can't seek in file"); + return NULL; + } + } else { + newfile=f; + } + fseek(newfile,0,SEEK_END); + fileLength=ftell(newfile); +/* fprintf(stderr, "fileLength=%ld\n", fileLength); */ + fseek(newfile,0,SEEK_SET); + ret=fread(oleBuf,1,BBD_BLOCK_SIZE,newfile); + if ( ret != BBD_BLOCK_SIZE ) { + return NULL; + } + if (strncmp(oleBuf,ole_sign,8) != 0) { + return NULL; + } + sectorSize = 1<= 0) && (i < msat_size)) { + unsigned char *newbuf; +/* fprintf(stderr, "i=%d mblock=%ld\n", i, mblock); */ + if ((newbuf=realloc(tmpBuf, sectorSize*(i+1)+MSAT_ORIG_SIZE)) != NULL) { + tmpBuf=newbuf; + } else { + perror("MSAT realloc error"); + free(tmpBuf); + ole_finish(); + return NULL; + } + + fseek(newfile, 512+mblock*sectorSize, SEEK_SET); + if(fread(tmpBuf+MSAT_ORIG_SIZE+(sectorSize-4)*i, + 1, sectorSize, newfile) != sectorSize) { + fprintf(stderr, "Error read MSAT!\n"); + ole_finish(); + return NULL; + } + + i++; + mblock=getlong(tmpBuf, MSAT_ORIG_SIZE+(sectorSize-4)*i); + } + +/* fprintf(stderr, 
"bbdNumBlocks=%ld\n", bbdNumBlocks); */ + for(i=0; i< bbdNumBlocks; i++) { + long int bbdSector=getlong(tmpBuf,4*i); + + if (bbdSector >= fileLength/sectorSize || bbdSector < 0) { + fprintf(stderr, "Bad BBD entry!\n"); + ole_finish(); + return NULL; + } + fseek(newfile, 512+bbdSector*sectorSize, SEEK_SET); + if ( fread(BBD+i*sectorSize, 1, sectorSize, newfile) != sectorSize ) { + fprintf(stderr, "Can't read BBD!\n"); + free(tmpBuf); + ole_finish(); + return NULL; + } + } + free(tmpBuf); + +/* Read SBD into memory */ + sbdLen=0; + sbdMaxLen=10; + sbdCurrent = sbdStart = getlong(oleBuf,0x3c); + if (sbdStart > 0) { + if((SBD=malloc(sectorSize*sbdMaxLen)) == NULL ) { + ole_finish(); + return NULL; + } + while(1) { + fseek(newfile, 512+sbdCurrent*sectorSize, SEEK_SET); + fread(SBD+sbdLen*sectorSize, 1, sectorSize, newfile); + sbdLen++; + if (sbdLen >= sbdMaxLen) { + unsigned char *newSBD; + + sbdMaxLen+=5; + if ((newSBD=realloc(SBD, sectorSize*sbdMaxLen)) != NULL) { + SBD=newSBD; + } else { + perror("SBD realloc error"); + ole_finish(); + return NULL; + } + } + sbdCurrent = getlong(BBD, sbdCurrent*4); + if(sbdCurrent < 0 || + sbdCurrent >= fileLength/sectorSize) + break; + } + sbdNumber = (sbdLen*sectorSize)/shortSectorSize; +/* fprintf(stderr, "sbdLen=%ld sbdNumber=%ld\n",sbdLen, sbdNumber); */ + } else { + SBD=NULL; + } +/* Read property catalog into memory */ + propLen = 0; + propMaxLen = 5; + propCurrent = propStart = getlong(oleBuf,0x30); + if (propStart >= 0) { + if((properties=malloc(propMaxLen*sectorSize)) == NULL ) { + ole_finish(); + return NULL; + } + while(1) { +/* fprintf(stderr, "propCurrent=%ld\n",propCurrent); */ + fseek(newfile, 512+propCurrent*sectorSize, SEEK_SET); + fread(properties+propLen*sectorSize, + 1, sectorSize, newfile); + propLen++; + if (propLen >= propMaxLen) { + unsigned char *newProp; + + propMaxLen+=5; + if ((newProp=realloc(properties, propMaxLen*sectorSize)) != NULL) + properties=newProp; + else { + perror("Properties realloc error"); 
+ ole_finish(); + return NULL; + } + } + + propCurrent = getlong(BBD, propCurrent*4); + if(propCurrent < 0 || + propCurrent >= fileLength/sectorSize ) { + break; + } + } +/* fprintf(stderr, "propLen=%ld\n",propLen); */ + propNumber = (propLen*sectorSize)/PROP_BLOCK_SIZE; + propCurNumber = 0; + } else { + ole_finish(); + properties = NULL; + return NULL; + } + + +/* Find Root Entry */ + while((tEntry=(oleEntry*)ole_readdir(newfile)) != NULL) { + if (!tEntry->name[0]||strcmp(tEntry->name,"Root Entry") == 0) { + rootEntry=tEntry; + break; + } + ole_close((FILE*)tEntry); + } + propCurNumber = 0; + fseek(newfile, 0, SEEK_SET); + if (!rootEntry) { + fprintf(stderr,"Cannot find root entry in this file!\n"); + ole_finish(); + return NULL; + } + return newfile; +} + +/** + * + * + * @param oleBuf + * + * @return + */ +int rightOleType(unsigned char *oleBuf) { + return (oleBuf[0x42] == 1 || oleBuf[0x42] == 2 || + oleBuf[0x42] == 3 || oleBuf[0x42] == 5 ); +} + +/** + * + * + * @param oleBuf + * + * @return + */ +oleType getOleType(unsigned char *oleBuf) { + return (oleType)((unsigned char)oleBuf[0x42]); +} + +/** + * Reads next directory entry from file + * + * @param name buffer for name converted to us-ascii should be at least 33 chars long + * @param size size of file + * + * @return 0 if everything is ok -1 on error + */ +FILE *ole_readdir(FILE *f) { + int i, nLen; + unsigned char *oleBuf; + oleEntry *e=NULL; + long int chainMaxLen, chainCurrent; + + if ( properties == NULL || propCurNumber >= propNumber || f == NULL ) + return NULL; + oleBuf=properties + propCurNumber*PROP_BLOCK_SIZE; + if( !rightOleType(oleBuf)) + return NULL; + if ((e = (oleEntry*)malloc(sizeof(oleEntry))) == NULL) { + perror("Can\'t allocate memory"); + return NULL; + } + e->dirPos=oleBuf; + e->type=getOleType(oleBuf); + e->file=f; + e->startBlock=getlong(oleBuf,0x74); + e->blocks=NULL; + + nLen=getshort(oleBuf,0x40); + for (i=0 ; i < nLen /2; i++) + e->name[i]=(char)oleBuf[i*2]; + e->name[i]='\0'; + 
propCurNumber++; + e->length=getulong(oleBuf,0x78); +/* Read sector chain for object */ + chainMaxLen = 25; + e->numOfBlocks = 0; + chainCurrent = e->startBlock; + e->isBigBlock = (e->length >= 0x1000) || !strcmp(e->name, "Root Entry"); +/* fprintf(stderr, "e->name=%s e->length=%ld\n", e->name, e->length); */ +/* fprintf(stderr, "e->startBlock=%ld BBD=%p\n", e->startBlock, BBD); */ + if (e->startBlock >= 0 && + e->length >= 0 && + (e->startBlock <= + fileLength/(e->isBigBlock ? sectorSize : shortSectorSize))) { + if((e->blocks=malloc(chainMaxLen*sizeof(long int))) == NULL ) { + return NULL; + } + while(1) { +/* fprintf(stderr, "chainCurrent=%ld\n", chainCurrent); */ + e->blocks[e->numOfBlocks++] = chainCurrent; + if (e->numOfBlocks >= chainMaxLen) { + long int *newChain; + chainMaxLen+=25; + if ((newChain=realloc(e->blocks, + chainMaxLen*sizeof(long int))) != NULL) + e->blocks=newChain; + else { + perror("Properties realloc error"); + free(e->blocks); + e->blocks=NULL; + return NULL; + } + } + if ( e->isBigBlock ) { + chainCurrent = getlong(BBD, chainCurrent*4); + } else if ( SBD != NULL ) { + chainCurrent = getlong(SBD, chainCurrent*4); + } else { + chainCurrent=-1; + } + if(chainCurrent <= 0 || + chainCurrent >= ( e->isBigBlock ? + ((bbdNumBlocks*sectorSize)/4) + : ((sbdNumber*shortSectorSize)/4) ) || + (e->numOfBlocks > + e->length/(e->isBigBlock ? sectorSize : shortSectorSize))) { +/* fprintf(stderr, "chain End=%ld\n", chainCurrent); */ + break; + } + } + } + + if(e->length > (e->isBigBlock ? sectorSize : shortSectorSize)*e->numOfBlocks) + e->length = (e->isBigBlock ? 
sectorSize : shortSectorSize)*e->numOfBlocks; +/* fprintf(stderr, "READDIR: e->name=%s e->numOfBlocks=%ld length=%ld\n", */ +/* e->name, e->numOfBlocks, e->length); */ + + return (FILE*)e; +} + +/** + * Open stream, which correspond to directory entry last read by + * ole_readdir + * + * + * @return opaque pointer to pass to ole_read, casted to (FILE *) + */ +int ole_open(FILE *stream) { + oleEntry *e=(oleEntry *)stream; + if ( e->type != oleStream) + return -2; + + e->ole_offset=0; + e->file_offset= ftell(e->file); + return 0; +} + +/** + * + * + * @param e + * @param blk + * + * @return + */ +long int calcFileBlockOffset(oleEntry *e, long int blk) { + long int res; + if ( e->isBigBlock ) { + res=512+e->blocks[blk]*sectorSize; + } else { + long int sbdPerSector=sectorSize/shortSectorSize; + long int sbdSecNum=e->blocks[blk]/sbdPerSector; + long int sbdSecMod=e->blocks[blk]%sbdPerSector; +/* fprintf(stderr, "calcoffset: e->name=%s e->numOfBlocks=%ld length=%ld sbdSecNum=%ld rootEntry->blocks=%p\n", + e->name, e->numOfBlocks, e->length, sbdSecNum, rootEntry->blocks);*/ + res=512 + rootEntry->blocks[sbdSecNum]*sectorSize + sbdSecMod*shortSectorSize; + } + return res; +} + + +/** + * Reads block from open ole stream interface-compatible with fread + * + * @param ptr pointer to buffer for read to + * @param size size of block + * @param nmemb size in blocks + * @param stream pointer to FILE* structure + * + * @return number of readed blocks + */ +size_t ole_read(void *ptr, size_t size, size_t nmemb, FILE *stream) { + oleEntry *e = (oleEntry*)stream; + long int llen = size*nmemb, rread=0, i; + long int blockNumber, modBlock, toReadBlocks, toReadBytes, bytesInBlock; + long int ssize; /**< Size of block */ + long int newoffset; + unsigned char *cptr = ptr; + if( e->ole_offset+llen > e->length ) + llen= e->length - e->ole_offset; + + ssize = (e->isBigBlock ? 
sectorSize : shortSectorSize); + blockNumber=e->ole_offset/ssize; +/* fprintf(stderr, "blockNumber=%ld e->numOfBlocks=%ld llen=%ld\n", */ +/* blockNumber, e->numOfBlocks, llen); */ + if ( blockNumber >= e->numOfBlocks || llen <=0 ) + return 0; + + modBlock=e->ole_offset%ssize; + bytesInBlock = ssize - modBlock; + if(bytesInBlock < llen) { + toReadBlocks = (llen-bytesInBlock)/ssize; + toReadBytes = (llen-bytesInBlock)%ssize; + } else { + toReadBlocks = toReadBytes = 0; + } +/* fprintf(stderr, "llen=%ld toReadBlocks=%ld toReadBytes=%ld bytesInBlock=%ld blockNumber=%ld modBlock=%ld\n", */ +/* llen, toReadBlocks, toReadBytes, bytesInBlock, blockNumber, modBlock); */ + newoffset = calcFileBlockOffset(e,blockNumber)+modBlock; + if (e->file_offset != newoffset) { + fseek(e->file, e->file_offset=newoffset, SEEK_SET); + } + rread=fread(ptr, 1, min(llen,bytesInBlock), e->file); + e->file_offset += rread; + for(i=0; ifile_offset); + fseek(e->file, e->file_offset=newoffset , SEEK_SET); + readbytes=fread(cptr+rread, 1, min(llen-rread, ssize), e->file); + rread +=readbytes; + e->file_offset +=readbytes; + } + if(toReadBytes > 0) { + int readbytes; + blockNumber++; + newoffset = calcFileBlockOffset(e,blockNumber); + fseek(e->file, e->file_offset=newoffset, SEEK_SET); + readbytes=fread(cptr+rread, 1, toReadBytes,e ->file); + rread +=readbytes; + e->file_offset +=readbytes; + } +/* fprintf(stderr, "ole_offset=%ld rread=%ld llen=%ld\n", + e->ole_offset, rread, llen);*/ + e->ole_offset+=rread; + return rread; +} + +/** + * + * + * @param stream + * + * @return + */ +int ole_eof(FILE *stream) { + oleEntry *e=(oleEntry*)stream; +/* fprintf(stderr, "EOF: e->ole_offset=%ld e->length=%ld\n", + e->ole_offset, e->length);*/ + return (e->ole_offset >= e->length); +} + +/** + * + * + */ +void ole_finish(void) { + if ( BBD != NULL ) free(BBD); + if ( SBD != NULL ) free(SBD); + if ( properties != NULL ) free(properties); + if ( rootEntry != NULL ) ole_close((FILE*)rootEntry); + properties = SBD 
= BBD = NULL; + rootEntry = NULL; +} + +/** + * + * + * @param stream + * + * @return + */ +int ole_close(FILE *stream) { + oleEntry *e=(oleEntry*)stream; + if(e == NULL) + return -1; + if (e->blocks != NULL) + free(e->blocks); + free(e); + return 0; +} + +/** + * + * + * @param stream pointer to OLE stream structure + * @param offset + * @param whence + * + * @return + */ +int ole_seek(FILE *stream, long offset, int whence) { + oleEntry *e=(oleEntry*)stream; + long int new_ole_offset=0, new_file_offset; + int ssize, modBlock, blockNumber; + + switch(whence) { + case SEEK_SET: + new_ole_offset=offset; + break; + + case SEEK_CUR: + new_ole_offset=e->ole_offset+offset; + break; + + case SEEK_END: + new_ole_offset=e->length+offset; + break; + + default: + errno=EINVAL; + return -1; + } + if(new_ole_offset<0) + new_ole_offset=0; + if(new_ole_offset >= e->length) + new_ole_offset=e->length; + + ssize = (e->isBigBlock ? sectorSize : shortSectorSize); + blockNumber=new_ole_offset/ssize; + if ( blockNumber >= e->numOfBlocks ) + return -1; + + modBlock=new_ole_offset%ssize; + new_file_offset = calcFileBlockOffset(e,blockNumber)+modBlock; + fseek(e->file, e->file_offset=new_file_offset, SEEK_SET); + e->ole_offset=new_ole_offset; + + return 0; +} + +/** + * Tell position inside OLE stream + * + * @param stream pointer to OLE stream + * + * @return current position inside OLE stream + */ +long ole_tell(FILE *stream) { + oleEntry *e=(oleEntry*)stream; + return e->ole_offset; +} + + +/** + * + * + */ +size_t (*catdoc_read)(void *ptr, size_t size, size_t nmemb, FILE *stream); +int (*catdoc_eof)(FILE *stream); +int (*catdoc_seek)(FILE *stream, long offset, int whence); +long (*catdoc_tell)(FILE *stream); + +void set_ole_func(void) { + catdoc_read=ole_read; + catdoc_eof=ole_eof; + catdoc_seek=ole_seek; + catdoc_tell=ole_tell; +} + +#ifdef feof +/* feof is macro in Turbo C, so we need a real function to assign to + * pointer + */ +int my_feof(FILE *f) { + return feof(f); +} +#define 
FEOF my_feof +#else +#define FEOF feof +#endif + +void set_std_func(void) { + catdoc_read=fread; + catdoc_eof=FEOF; + catdoc_seek=fseek; + catdoc_tell=ftell; +} + diff --git a/src/ole.h b/src/ole.h new file mode 100644 index 0000000..181c7e7 --- /dev/null +++ b/src/ole.h @@ -0,0 +1,67 @@ +/** + * @file ole.h + * @author Alex Ott + * @date 03 éÀÎ 2003 + * Version: $Id: ole.h,v 1.1 2006-02-24 17:44:06 vitus Exp $ + * Copyright: Alex Ott, 2003, + * + * @brief + * + * + */ + +#ifndef _OLE_H +#define _OLE_H 1 + +#define BBD_BLOCK_SIZE 512 +#define SBD_BLOCK_SIZE 64 +#define PROP_BLOCK_SIZE 128 +#define OLENAMELENGHT 32 +#define MSAT_ORIG_SIZE 436 + +typedef enum { + oleDir=1, + oleStream=2, + oleRootDir=5, + oleUnknown=3 +} oleType; + +typedef struct { + FILE *file; + char name[OLENAMELENGHT+1]; + long int startBlock; + long int curBlock; + unsigned long int length; + long int ole_offset; + long int file_offset; + unsigned char *dirPos; + oleType type; + long int numOfBlocks; + long int *blocks; /**< array of blocks numbers */ + int isBigBlock; +} oleEntry; + +/** + * Functions + * + */ +FILE* ole_init(FILE *f, void *buffer, size_t bufSize); +FILE *ole_readdir(FILE *f); +int ole_seek(FILE *stream, long offset, int whence); +long ole_tell(FILE *stream); +int ole_eof(FILE *stream); +size_t ole_read(void *ptr, size_t size, size_t nmemb, FILE *stream); +int ole_open(FILE *); +int ole_close(FILE *); +void ole_finish(void); + +extern size_t (*catdoc_read)(void *ptr, size_t size, size_t nmemb, FILE *stream); +extern int (*catdoc_eof)(FILE *stream); +extern int (*catdoc_seek)(FILE *stream, long offset, int whence); +extern long (*catdoc_tell)(FILE *stream); + +void set_ole_func(void); +void set_std_func(void); + +#endif /* _OLE_H */ + diff --git a/src/ppt.h b/src/ppt.h new file mode 100644 index 0000000..dfc7fb3 --- /dev/null +++ b/src/ppt.h @@ -0,0 +1,22 @@ +/** + * @file ppt.h + * @author Alex Ott + * @date 23 äÅË 2004 + * Version: $Id: ppt.h,v 1.1 2006-02-24 17:44:06 vitus 
Exp $ + * Copyright: Alex Ott + * + * @brief definitions of .ppt processing functions + * + * + */ + +#ifndef _PPT_H +#define _PPT_H 1 + +#include +#include + +void do_ppt(FILE *input,char *filename); + +#endif /* _PPT_H */ + diff --git a/src/pptparse.c b/src/pptparse.c new file mode 100644 index 0000000..e74c93a --- /dev/null +++ b/src/pptparse.c @@ -0,0 +1,286 @@ +/** + * @file pptparse.c + * @author Alex Ott + * @date 23 äÅË 2004 + * Version: $Id: pptparse.c,v 1.1 2006-02-24 17:44:06 vitus Exp $ + * Copyright: Alex Ott + * + * @brief .ppt parsing routines + * + * + */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#include "ppt.h" +#include "catdoc.h" +#include "ppttypes.h" + +static void process_item (int rectype, long reclen, FILE* input); + +#if !defined(min) +#define min(x,y) ((x) < (y) ? (x) : (y)) +#endif + + +/** + * + * + * @param input + * @param filename + */ +void do_ppt(FILE *input,char *filename) { + int itemsread=1; + int rectype; + long reclen; + unsigned char recbuf[8]; + + while(itemsread) { + itemsread = catdoc_read(recbuf, 1, 8, input); +/* fprintf(stderr,"itemsread=%d: ",itemsread); */ +/* for(i=0; i<8; i++) */ +/* fprintf(stderr,"%02x ",recbuf[i]); */ +/* fprintf(stderr,"\n"); */ + + if (catdoc_eof(input)) { + process_item(DOCUMENT_END,0,input); + return; + } + if(itemsread < 8) + break; + rectype=getshort(recbuf,2); + reclen=getulong(recbuf,4); + if (reclen < 0) { + return; + } + process_item(rectype,reclen,input); + } +} + + +/** + * + * + * @param rectype + * @param reclen + * @param input + */ +static void process_item (int rectype, long reclen, FILE* input) { + int i=0, u; + static char buf[2]; + + switch(rectype) { + case DOCUMENT_END: +/* fprintf(stderr,"End of document, ended at %ld\n",catdoc_tell(input)); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case DOCUMENT: +/* fprintf(stderr,"Start of document, reclen=%ld, started at %ld\n", reclen, */ +/* catdoc_tell(input)); */ + break; + + 
case DOCUMENT_ATOM: +/* fprintf(stderr,"DocumentAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case SLIDE: +/* fprintf(stderr,"Slide, reclen=%ld\n", reclen); */ +/* fputs("---------------------------------------\n",stderr); */ + break; + + case SLIDE_ATOM: +/* fprintf(stderr,"SlideAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case SLIDE_BASE: +/* fprintf(stderr,"SlideBase, reclen=%ld\n", reclen); */ + break; + + case SLIDE_BASE_ATOM: +/* fprintf(stderr,"SlideBaseAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case NOTES: +/* fprintf(stderr,"Notes, reclen=%ld\n", reclen); */ + break; + + case NOTES_ATOM: +/* fprintf(stderr,"NotesAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case HEADERS_FOOTERS: +/* fprintf(stderr,"HeadersFooters, reclen=%ld\n", reclen); */ + break; + + case HEADERS_FOOTERS_ATOM: +/* fprintf(stderr,"HeadersFootersAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case MAIN_MASTER: +/* fprintf(stderr,"MainMaster, reclen=%ld\n", reclen); */ + break; + + case TEXT_BYTES_ATOM: { +/* fprintf(stderr,"TextBytes, reclen=%ld\n", reclen); */ + for(i=0; i < reclen; i++) { + catdoc_read(buf,1,1,input); + if((unsigned char)*buf!=0x0d) + fputs(convert_char((unsigned char)*buf),stdout); + else + fputc('\n',stdout); + } + fputc('\n',stdout); + } + break; + + case TEXT_CHARS_ATOM: + case CSTRING: { + long text_len; + +/* fprintf(stderr,"CString, reclen=%ld\n", reclen); */ + text_len=reclen/2; + for(i=0; i < text_len; i++) { + catdoc_read(buf,2,1,input); + u=(unsigned short)getshort(buf,0); + if(u!=0x0d) + fputs(convert_char(u),stdout); + else + fputc('\n',stdout); + } + fputc('\n',stdout); + } + break; + + case USER_EDIT_ATOM: +/* fprintf(stderr,"UserEditAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case COLOR_SCHEME_ATOM: +/* 
fprintf(stderr,"ColorSchemeAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case PPDRAWING: +/* fprintf(stderr,"PPDrawing, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case ENVIRONMENT: +/* fprintf(stderr,"Environment, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case SSDOC_INFO_ATOM: +/* fprintf(stderr,"SSDocInfoAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case SSSLIDE_INFO_ATOM: +/* fprintf(stderr,"SSSlideInfoAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case PROG_TAGS: +/* fprintf(stderr,"ProgTags, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case PROG_STRING_TAG: +/* fprintf(stderr,"ProgStringTag, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case PROG_BINARY_TAG: +/* fprintf(stderr,"ProgBinaryTag, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case LIST: +/* fprintf(stderr,"List, reclen=%ld\n", reclen); */ + break; + + case SLIDE_LIST_WITH_TEXT: +/* fprintf(stderr,"SlideListWithText, reclen=%ld\n", reclen); */ +/* fputs("---------------------------------------\n",stderr); */ + break; + + case PERSIST_PTR_INCREMENTAL_BLOCK: +/* fprintf(stderr,"PersistPtrIncrementalBlock, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case EX_OLE_OBJ_STG: +/* fprintf(stderr,"ExOleObjStg, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case PPDRAWING_GROUP: +/* fprintf(stderr,"PpdrawingGroup, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case EX_OBJ_LIST: +/* fprintf(stderr,"ExObjList, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case TX_MASTER_STYLE_ATOM: +/* fprintf(stderr,"TxMasterStyleAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + 
break; + + case HANDOUT: +/* fprintf(stderr,"Handout, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case SLIDE_PERSIST_ATOM: +/* fprintf(stderr,"SlidePersistAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case TEXT_HEADER_ATOM: +/* fprintf(stderr,"TextHeaderAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case TEXT_SPEC_INFO: +/* fprintf(stderr,"TextSpecInfo, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + case STYLE_TEXT_PROP_ATOM: +/* fprintf(stderr,"StyleTextPropAtom, reclen=%ld\n", reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + break; + + /* case : + fprintf(stderr,", reclen=%ld\n", reclen); + catdoc_seek(input, reclen, SEEK_CUR); + break;*/ + + /* case : + fprintf(stderr,", reclen=%ld\n", reclen); + catdoc_seek(input, reclen, SEEK_CUR); + break;*/ + + default: +/* fprintf(stderr,"Default action for rectype=%d reclen=%ld\n", */ +/* rectype, reclen); */ + catdoc_seek(input, reclen, SEEK_CUR); + + } + +} diff --git a/src/ppttypes.h b/src/ppttypes.h new file mode 100644 index 0000000..521e0b3 --- /dev/null +++ b/src/ppttypes.h @@ -0,0 +1,63 @@ +/** + * @file ppttypes.h + * @author Alex Ott + * @date 26 äÅË 2004 + * Version: $Id: ppttypes.h,v 1.1 2006-02-24 17:44:06 vitus Exp $ + * Copyright: Alex Ott + * + * @brief Enumerations for .ppt records + * + * + */ + +#ifndef _PPTTYPES_H +#define _PPTTYPES_H 1 + +#define UNKNOWN 0 +#define DOCUMENT 1000 +#define DOCUMENT_ATOM 1001 +#define DOCUMENT_END 1002 +#define SLIDE_PERSIST 1003 +#define SLIDE_BASE 1004 +#define SLIDE_BASE_ATOM 1005 +#define SLIDE 1006 +#define SLIDE_ATOM 1007 +#define NOTES 1008 +#define NOTES_ATOM 1009 +#define ENVIRONMENT 1010 +#define SLIDE_PERSIST_ATOM 1011 +#define MAIN_MASTER 1016 +#define SSSLIDE_INFO_ATOM 1017 +#define SSDOC_INFO_ATOM 1025 +#define EX_OBJ_LIST 1033 +#define PPDRAWING_GROUP 1035 +#define PPDRAWING 1036 +#define LIST 2000 +#define 
COLOR_SCHEME_ATOM 2032 +#define TEXT_HEADER_ATOM 3999 +#define TEXT_CHARS_ATOM 4000 +#define STYLE_TEXT_PROP_ATOM 4001 +#define TX_MASTER_STYLE_ATOM 4003 +#define TEXT_BYTES_ATOM 4008 +#define TEXT_CISTYLE_ATOM 4008 +#define TEXT_SPEC_INFO 4010 +#define EX_OLE_OBJ_STG 4113 +#define CSTRING 4026 +#define HANDOUT 4041 +#define HEADERS_FOOTERS 4057 +#define HEADERS_FOOTERS_ATOM 4058 +#define SLIDE_LIST_WITH_TEXT 4080 +#define SLIDE_LIST 4084 +#define USER_EDIT_ATOM 4085 +#define PROG_TAGS 5000 +#define PROG_STRING_TAG 5001 +#define PROG_BINARY_TAG 5002 +#define PERSIST_PTR_INCREMENTAL_BLOCK 6002 +/* #define */ +/* #define */ +/* #define */ +/* #define */ +/* #define */ + +#endif /* _PPTTYPES_H */ + diff --git a/src/reader.c b/src/reader.c new file mode 100644 index 0000000..b51996e --- /dev/null +++ b/src/reader.c @@ -0,0 +1,224 @@ +/*****************************************************************/ +/* Reading routines for MS-Word, MS-Write and text files */ +/* */ +/* This file is part of catdoc project */ +/* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003 */ +/*****************************************************************/ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include "catdoc.h" +unsigned short int buffer[PARAGRAPH_BUFFER]; +static unsigned char read_buf[256]; +static int buf_is_unicode; + +/**************************************************************************/ +/* Just prints out content of input file. Called when file is not OLE */ +/* stream */ +/* Parameters - f - file to copy out. header - first few bytes of file, */ +/* which have been already read by format recognition code, but should */ +/* be output anyway */ +/**************************************************************************/ +void copy_out (FILE *f,char *header) { + char *buf=(char *)buffer; + int count,i; + long offset; + if (get_unicode_char == get_word8_char) { + /* non-word file and -u specified. 
/**
 * Kinds of RTF control words/symbols the parser distinguishes.
 */
typedef enum {
    RTF_CODEPAGE,
    RTF_FONT_CHARSET,
    RTF_UC,
    RTF_UNICODE_CHAR,
    RTF_CHAR,
    RTF_PARA,
    RTF_TABLE_START,
    RTF_TABLE_END,
    RTF_ROW,
    RTF_CELL,
    RTF_UNKNOWN,
    RTF_OVERLAY,
    RTF_PICT,
    RTF_F,
    RTF_AUTHOR,
    RTF_FONTTBL,
    RTF_INFO,
    RTF_STYLESHEET,
    RTF_COLORTBL,
    RTF_LISTOVERRIDETABLE,
    RTF_LISTTABLE,
    RTF_RSIDTBL,
    RTF_GENERATOR,
    RTF_DATAFIELD,
    RTF_LANG,
    RTF_PARD,
    RTF_TAB,
    RTF_SPEC_CHAR,
    RTF_EMDASH,
    RTF_ENDASH,
    RTF_EMSPACE,
    RTF_ENSPACE,
    RTF_BULLET,
    RTF_LQUOTE,
    RTF_RQUOTE,
    RTF_LDBLQUOTE,
    RTF_RDBLQUOTE,
    RTF_ZWNONJOINER,
} RTFTypes;

/** One entry of the control-word lookup table. */
typedef struct {
    char *name;         /**< control word without the leading backslash */
    RTFTypes type;      /**< type reported for that word */
} RTFTypeMap;

/**
 * Control words recognised by the parser.  Lookup is a linear exact-match
 * search, so the order of entries does not matter.
 */
RTFTypeMap rtf_types[]={
    {"uc",RTF_UC},
    {"ansicpg",RTF_CODEPAGE},
    {"pard",RTF_PARD},
    {"par",RTF_PARA},
    {"cell",RTF_CELL},
    {"row",RTF_ROW},
    {"overlay",RTF_OVERLAY},
    {"pict",RTF_PICT},
    {"author",RTF_AUTHOR},
    {"f",RTF_F},
    {"fonttbl",RTF_FONTTBL},
    {"info",RTF_INFO},
    {"stylesheet",RTF_STYLESHEET},
    {"colortbl",RTF_COLORTBL},
    {"listtable",RTF_LISTTABLE},
    {"listoverridetable",RTF_LISTOVERRIDETABLE},
    {"rsidtbl",RTF_RSIDTBL},
    {"generator",RTF_GENERATOR},
    {"datafield",RTF_DATAFIELD},
    {"lang",RTF_LANG},
    {"tab",RTF_TAB},
    {"emdash",RTF_EMDASH},
    {"endash",RTF_ENDASH},
    /* The space control words have their own types; they used to be
     * mapped to the dash types and came out as dashes. */
    {"emspace",RTF_EMSPACE},
    {"enspace",RTF_ENSPACE},
    {"bullet",RTF_BULLET},
    {"lquote",RTF_LQUOTE},
    {"rquote",RTF_RQUOTE},
    {"ldblquote",RTF_LDBLQUOTE},
    {"rdblquote",RTF_RDBLQUOTE},
    {"zwnj",RTF_ZWNONJOINER},
    {"u",RTF_UNICODE_CHAR}
};

#define RTFNAMEMAXLEN 32
#define RTFARGSMAXLEN 64

/**
 * One parsed RTF command: its type, its name and its numeric argument.
 */
typedef struct {
    RTFTypes type;
    char name[RTFNAMEMAXLEN+1];
    signed int numarg;
} RTFcommand;


#define MAXFONTNAME 64
/**
 * Font description: numeric id plus font name.
 */
typedef struct {
    int name;
    char fontname[MAXFONTNAME+1];
} RTFFont;

/**
 * Structure to describe style
 */
typedef struct {
    int codepage;
} RTFStyle;

/**
 * Values that are local to one RTF group.
 */
typedef struct {
    int uc;             /**< How many symbols to skip */
    RTFStyle* style;    /**< current style */
} RTFGroupData;
+extern int forced_charset; +signed long getNumber(FILE *f); + +int getRtfCommand(FILE *f, RTFcommand *command ); +unsigned short int rtf_to_unicode(int code); +RTFTypes getCommandType(char *name); +signed int getCharCode(FILE *f); +void rtfSetCharset(short int **charset_ptr,unsigned int codepage); + +/******************************************************** + * Global data + * + */ +short int *current_charset; +int rtf_level=0; + +/******************************************************** + * Functions implementation + * + */ +extern unsigned short int buffer[]; +void add_to_buffer(int *bufptr,unsigned short int c) { + buffer[++(*bufptr)]=c; + if (*bufptr > PARAGRAPH_BUFFER-2) { + buffer[++(*bufptr)]=0; + output_paragraph(buffer); + *bufptr=-1; + } +} + +void end_paragraph(int *bufptr) { + add_to_buffer(bufptr,0x000a); + add_to_buffer(bufptr,0); + output_paragraph(buffer); + *bufptr=-1; +} + +/** + * Parses RTF file from file stream + * + * @param f - file stream descriptor + */ +int parse_rtf(FILE *f) { + int para_mode=0, data_skip_mode=0,i; + RTFGroupData *groups=NULL; + int group_count=0, group_store=20; + int bufptr=-1; + current_charset=source_charset; + fseek(f,0,SEEK_SET); + if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) { + perror("Can\'t allocate memory: "); + return 1; + } + groups[0].uc = 2; /* DEfault uc = 2 */ + while ( !feof(f) ) { + int c = fgetc(f); + if ( feof( f ) ) + break; + switch (c) { + case '\\': { + int code; + RTFcommand com; + if ((code=getRtfCommand(f, &com)) != 0) + break; + switch (com.type) { + case RTF_SPEC_CHAR: +/* fprintf(stderr, "Spec Char found=%s and arg=%c\n", */ +/* com.name, com.numarg); */ + if (com.numarg == '*' && data_skip_mode == 0) { + data_skip_mode=group_count; + } else if (com.numarg == '\r') { + end_paragraph(&bufptr); + } else if (com.numarg == '~') { + add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */ + } else if (com.numarg == '-') { + add_to_buffer(&bufptr,0xAD);/* Optional hyphen 
*/ + } + + break; + case RTF_EMDASH: + add_to_buffer(&bufptr,0x2014);/* EM DASH*/ + break; + case RTF_ENDASH: + add_to_buffer(&bufptr,0x2013);break; + case RTF_BULLET: + add_to_buffer(&bufptr,0x2022);break; + case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break; + case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break; + case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break; + case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break; + case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break; + case RTF_EMSPACE: + case RTF_ENSPACE: + add_to_buffer(&bufptr,' ');break; + case RTF_CHAR: +/* fprintf(stderr, "RTF char %d\n", com.numarg); */ + if (data_skip_mode == 0) { + add_to_buffer(&bufptr,rtf_to_unicode(com.numarg)); + } + break; + case RTF_UC: + groups[group_count].uc=com.numarg; + break; + case RTF_TAB: + add_to_buffer(&bufptr,0x0009); + break; + case RTF_UNICODE_CHAR: + if (com.numarg < 0) + break; +/* fprintf(stderr, "Unicode char %d\n", com.numarg); */ + if (data_skip_mode == 0) + add_to_buffer(&bufptr,com.numarg); + i=groups[group_count].uc; + while((--i)>0) + fgetc(f); + break; + case RTF_PARA: + /*if (para_mode > 0) {*/ + end_paragraph(&bufptr); + /*}*/ + para_mode=group_count; + break; + case RTF_PICT: + case RTF_FONTTBL: + case RTF_INFO: + case RTF_COLORTBL: + case RTF_STYLESHEET: + case RTF_LISTTABLE: + case RTF_LISTOVERRIDETABLE: + case RTF_RSIDTBL: + case RTF_GENERATOR: + case RTF_DATAFIELD: + if (data_skip_mode == 0){ + data_skip_mode=group_count; + } + break; + case RTF_LANG: +/* fprintf(stderr, "Selected lang = %d\n",com.numarg); */ + break; + case RTF_CODEPAGE: + rtfSetCharset(¤t_charset,com.numarg); + default: +/* fprintf(stderr, "Unknown command with name %s and arg=%d\n", */ +/* com.name, com.numarg); */ + ; + } + break; + } + case '{': + group_count++; + if (group_count >= group_store ) { + group_store+=10; + if((groups=(RTFGroupData*)realloc(groups, + group_store*sizeof(RTFGroupData))) + == NULL ) { + perror("Can\'t allocate memory: "); + 
return 1; + } + } + if (para_mode) + add_to_buffer(&bufptr,0x20); + groups[group_count]=groups[group_count-1]; + break; + case '}': + group_count--; + if(group_count < 0) + group_count=0; + if(para_mode > 0 && para_mode > group_count) { + /*add_to_buffer(&bufptr,0); + output_paragraph(buffer); + fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr); + bufptr=-1;*/ + para_mode=0; + } + if(data_skip_mode > group_count) { + data_skip_mode=0; + } + break; + default: + if (data_skip_mode == 0) + if (c != '\n' && c != '\r') + add_to_buffer(&bufptr,rtf_to_unicode(c)); + } + } + if (bufptr>=0) { + add_to_buffer(&bufptr,'\n'); + add_to_buffer(&bufptr,0); + output_paragraph(buffer); + } + free(groups); + return 0; +} + +/** + * Convert text string to number + * + * @param f stream to read data from + * + * @return converted number + */ +signed long getNumber(FILE *f) { + int c,count=0; + char buf[RTFARGSMAXLEN]; + + while(isdigit(c=fgetc(f)) || c=='-') { + if(feof(f)) + return -1; + buf[count++]=(char)c; + } + ungetc(c,f); + buf[count]='\0'; + return strtol(buf, (char **)NULL, 10); +} + +/** + * Parse command stream from rtf file and fill command structure + * + * @param f - rtf file stream + * @param command - pointer to RTFcommand structure to fill + * + * @return parse code not 0 - error, 0 - success + */ +int getRtfCommand(FILE *f, RTFcommand *command ) { + int c=fgetc(f); + if (isalpha(c)) { + int name_count=1; + command->name[0]=(char)c; + while(isalpha(c=fgetc(f)) && name_count < RTFNAMEMAXLEN) { + if(feof(f)) + return 1; + command->name[name_count++]=(char)c; + } + command->name[name_count]='\0'; + command->type=getCommandType(command->name); +/* command->args=NULL; */ + ungetc(c,f); + if (isdigit(c) || c == '-' ) + command->numarg=getNumber(f); + else + command->numarg=0; + c=fgetc(f); + if(!(c==' ' || c=='\t')) + ungetc(c,f); + } else { + command->name[0]=(char)c; + command->name[1]='\0'; +/* command->args=NULL; */ + if 
(c == '\'') { + command->type=RTF_CHAR; + command->numarg=getCharCode(f); + if(feof(f)) + return -1; + } else { + command->type=RTF_SPEC_CHAR; + command->numarg=c; + } + } + + return 0; +} + +/** + * Converts char to unicode. + * + * @param code - integer code of char + * + * @return converted char + */ +unsigned short int rtf_to_unicode(int code) { + int cc=code; + if (code < 0 || (cc=to_unicode(current_charset, code)) < 0 ) return 0xFEFF; + return cc; +} + +/** + * Convert name of RTF command to RTFType + * + * @param name name to convert + * + * @return RTFType, if unknown command, then return RTF_UNKNOWN + */ +RTFTypes getCommandType(char *name) { + int i, olen=sizeof(rtf_types)/sizeof(RTFTypeMap); + for (i = 0; i < olen ; i++) { + if ( strcmp(name,rtf_types[i].name) == 0 ) { + return rtf_types[i].type; + } + } + return RTF_UNKNOWN; +} + +/** + * Return number representing char code in Hex + * + * @param f stream to read data from + * + * @return converted number + */ +signed int getCharCode(FILE *f) { + int c,count=0,i; + char buf[RTFARGSMAXLEN]; + for(i=0;i<2; i++) { + if (isdigit(c=fgetc(f))||(c>='a' && c<='f')) { + if(feof(f)) + return -1; + buf[count++]=(char)c; + } else + ungetc(c,f); + } + + buf[count]='\0'; + return strtol(buf, (char **)NULL, 16); +} + +void rtfSetCharset(short int **charset_ptr,unsigned int codepage) +{ + /* Do not override charset if it is specified in the command line */ + const char *charset_name; + char *save_buf = input_buffer; + if (forced_charset) return; + charset_name = charset_from_codepage(codepage); + check_charset(&source_csname,charset_name); + input_buffer=NULL; + *charset_ptr = read_charset(source_csname); + input_buffer = save_buf; +} diff --git a/src/sheet.c b/src/sheet.c new file mode 100644 index 0000000..9b95fe1 --- /dev/null +++ b/src/sheet.c @@ -0,0 +1,148 @@ +/*****************************************************************/ +/* Representation and handling of Excell worksheets in memory */ +/* */ +/* This file 
is part of catdoc project */ +/* (c) Victor Wagner 1998-2003, (c) Alex Ott 2003 */ +/*****************************************************************/ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include "xls.h" +struct rowdescr *rowptr=NULL; +int startrow=0,lastrow=0; +char cell_separator = ','; +int quote_mode = QUOTE_ALL_STRINGS; +char *sheet_separator = "\f"; +/* + * Allocates cell for given row and col and returns pointer to poitrer + * of cell contents + */ +unsigned char **allocate (int row,int col) { + unsigned int newrow,newcol; + if (row>=lastrow) { + newrow=(row/16+1)*16; + rowptr=realloc(rowptr,newrow*sizeof(struct rowdescr)); + if (rowptr == NULL) { + perror("allocating sheet "); + exit(1); + } + memset(rowptr+lastrow,0,(newrow-lastrow)*sizeof(struct rowdescr)); + lastrow=newrow; + } + if (col>=rowptr[row].end) { + newcol=(col/16+1)*16; + rowptr[row].cells=realloc(rowptr[row].cells,newcol*sizeof(char *)); + if (rowptr[row].cells == NULL) { + perror("allocating row"); + exit(1); + } + memset(rowptr[row].cells+rowptr[row].end,0,(newcol-rowptr[row].end) + *sizeof(char *)); + rowptr[row].end=newcol; + } + if (col>rowptr[row].last) rowptr[row].last=col; + return (rowptr[row].cells+col); +} +/* + * Frees up all memory used by sheet + */ +void free_sheet(void) { + int i,j; + struct rowdescr *row; + unsigned char **col; + for (row=rowptr,i=0;icells) continue; + for (col=row->cells,j=0;jend;j++,col++) { + if (*col) { + free(*col); + } + } + free (row->cells); + } + free (rowptr); + rowptr=NULL; + lastrow=0; +} + +/* + * prints out one value with quoting + * uses global variable quote_mode + */ +void print_value(unsigned char *value) +{ + int i,len; + int quotes=0; + if (value != NULL) { + len=strlen((char *)value); + } else { + len = 0; + } + switch (quote_mode) { + case QUOTE_NEVER: + break; + case QUOTE_SPACES_ONLY: + for (i=0;i0&&!rowptr[lastrow].cells) lastrow--; + for(i=0,row=rowptr;i<=lastrow;i++,row++) { + if (row->cells) { + 
/******************************************************************/
/* Checks for terminator of character sequence. If stop is something
 * like quote - check for same char. If it is \n, check for any space.
 * Returns non-zero when c terminates the sequence.
 ********************************************************************/
int isstop(char c, char stop) {
    if (stop=='\n') {
        /* <ctype.h> functions require an unsigned char value */
        return isspace((unsigned char)c);
    }
    return c==stop;
}
*/ +/************************************************************************/ +SUBSTMAP read_substmap(char* filename) { + FILE *f; + SUBSTMAP map=calloc(sizeof(char **),256); + char *path, line[1024], *p, *q; + char s[256]; + char stopchar; + int escaped, lineno=0, i; + unsigned int uc; + path=find_file(filename,add_exe_path(map_path)); + if (!path) { + free(map); + return(NULL); + } + if (!map) { + fprintf(stderr,"Insufficient memory\n"); + exit(1); + } + f=fopen(path,"rb"); + if (!f) { + perror("catdoc"); + return NULL; + } + if (input_buffer) + setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER); + + while (!feof(f)) { + if (!fgets(line,1024,f)) continue; + lineno++; + /* parse line */ + + /* skip leading space */ + for(p=line;*p && isspace(*p);p++); + /* if #, it is comment */ + if (!*p || +#ifdef __MSDOS__ + *p==0x1A || /* DOS have strange habit of using ^Z as eof */ +#endif + *p=='#') continue; + /* read hexadecimal code */ + uc = strtol(p,&p,16); + if (!isspace(*p)|| uc<0 || uc>0xfffd) { + fprintf(stderr,"Error parsing %s(%d)\n",path,lineno); + continue; + } + /* skip space between code and sequence */ + for(;*p && isspace(*p);p++); + if (!p) continue; + switch (*p) { + case '\'': + case '"': + stopchar=*p; + break; + case '(': + stopchar=')'; + break; + case '[': + stopchar=']'; + break; + case '{': + stopchar='}'; + break; + default: + p--; + stopchar='\n'; + } + p++; + q=p; + escaped=0; + while (*q && (!isstop(*q,stopchar) || escaped)) { + if (escaped) { + escaped=0; + } else { + escaped= (*q=='\\'); + } + q++; + } + if (*q!=stopchar && !(isspace(*q) && stopchar=='\n')) { + fprintf(stderr,"Error parsing %s(%d): unterminated sequence\n", + path,lineno); + continue; + } + /* HERE SHOULD BE BACKSLASH ESCAPE PROCESSING !!!*/ + *q=0; + for (q=s,i=0;*p && i<256;q++,i++) { + if (*p!='\\') { + *q=*p++; + } else { + switch (*(++p)) { + case 'n': *q='\n'; break; + case 'r': *q='\r'; break; + case 't': *q='\t'; break; + case 'b': *q='\b'; break; + case '\"': *q='\"'; break; 
+ case '\'': *q='\''; break; + case '0': *q=strtol(p,&p,8); p--; break; + case '\\': + default: + *q=*p; + } + p++; + } + } + *q=0; + if (i>longest_sequence) + longest_sequence=i; + map_insert(map,uc,s); + } + fclose(f); + free(path); + return map; +} + +/*************************************************************************/ +/* searches for string in the substituton map. Returns NULL if not found */ +/*************************************************************************/ +char * map_subst ( SUBSTMAP map, int uc) { + char **p=map[(unsigned)uc >>8]; + if (!p) return NULL; + return p[uc & 0xff]; +} + +/*************************************************************************/ +/* inserts string + unicode code into map */ +/*************************************************************************/ +void map_insert(SUBSTMAP map, int uc, const char *s) { + SUBSTMAP p=map+((unsigned)uc>>8); + + if (!*p) { + *p= calloc(sizeof(char*),256); + if (!*p) { + fprintf(stderr,"Insufficient memory\n"); + exit(1); + } + } + (*p)[uc & 0xff] = strdup(s); +} diff --git a/src/wordview.tcl b/src/wordview.tcl new file mode 100755 index 0000000..bff4f30 --- /dev/null +++ b/src/wordview.tcl @@ -0,0 +1,282 @@ +# -* wish *- +# fallback which allows me to run wordview.tcl without doing make +package require Tcl 8.3 + +if ![info exist charset_lib] { + set charset_lib /usr/local/lib/catdoc +} +option add *Text.Font {Courier 11} widgetDefault +option add *Text.Background white widgetDefault +option add *Text.Foreground black widgetDefault +option add *Text.selectBackground black widgetDefault +option add *Text.selectForeground white widgetDefault +option add *Text.findMode exact widgetDefault +option add *Text.findCase no widgetDefault +option add *Menu.highlightBackground MidnightBlue widgetDefault +option add *Menu.highlightThickness 0 widgetDefault +option add *Menu.activeBackground MidnightBlue widgetDefault +option add *Menu.activeForeground white widgetDefault +option add 
*Menu.activeBorderWidth 0 widgetDefault +menu .mainmenu +. configure -menu .mainmenu +.mainmenu add cascade -label File -menu [set m [menu .mainmenu.file]] -underline 0 +$m add command -label Open... -command load_file -accelerator Ctrl-O +$m add command -label "Save As..." -command write_file -accelerator Ctrl-S -state disabled +$m add separator +$m add command -label Quit -command exit -accelerator Alt-F4 +set m [menu .mainmenu.edit -postcommand EditEnable] +.mainmenu add cascade -label Edit -menu $m -underline 0 -state disabled +$m add command -label Copy -command CopySel -accelerator Ctrl-C +$m add separator +$m add command -label "Select All" -accelerator Ctrl-A -command \ + {.text tag add sel 0.0 end} +.mainmenu add cascade -label Find -menu .mainmenu.search -underline 1 -state disabled +set m [menu .mainmenu.search -postcommand EnableSearch] +$m add command -label "Find..." -command FindDialog -accelerator Ctrl-F +$m add command -label "Find Again" -accelerator F3 -command DoFind +# +# build charset menu +# + +.mainmenu add cascade -state disabled -label Encoding -menu [set m [menu .mainmenu.encoding]] +$m add radio -label Default -value Default -var in_charset +$m add radio -label unicode -value unicode -var in_charset +foreach l [glob [file join $charset_lib *.txt]] { + set n [file rootname [file tail $l]] + $m add radio -label $n -value $n -var in_charset +} + +set in_charset Default + +trace var in_charset w reread +set m [menu .mainmenu.help] +.mainmenu add cascade -label Help -menu $m -underline 0 +$m add command -label "Manual page" -command [list show_help [file tail $argv0]] +$m add command -label "Regular expressions" -command {show_help re_syntax} +$m add separator +$m add command -label "About..." 
-command AboutDialog + + + +text .text -width 80 -height 25 -xscrollcommand ".xs set" \ + -yscrollcommand ".ys set" -wrap word \ + -spacing3 2m +.text tag configure sel -relief flat -borderwidth 0 +.text tag configure doc -lmargin1 0.2i -lmargin2 0 +scrollbar .ys -orient vert -command ".text yview" +scrollbar .xs -orient horiz -command ".text xview" +bind .text { if [info exists FindPattern] DoFind} +bind .text load_file +bind .text load_file +bind .text {write_file} +bind .text {write_file} +bind .text FindDialog +bind .text FindDialog +grid .text .ys +grid .xs x +grid .text -sticky news +grid .xs -sticky we +grid .ys -sticky ns +grid columnconfigure . 0 -weight 1 +grid columnconfigure . 1 -weight 0 +grid rowconfigure . 0 -weight 1 +grid rowconfigure . 1 -weight 0 + +# Find options (All this can be tuned from dialog) +set FindMode -[option get .text findMode FindMode] ;# no -regexp for novices +set FindDir -forwards ;# Why not -backwards +set FindCase -nocase ;# Leave it empty if you want to be case sensitive +if {[option get .text findCase FindCase]} { + set FindCase "" +} + + +proc show_help {page} { + global argv0 + if [winfo exists .man] { + wm deiconify .man + raise .man + .man.text delete 0.0 end + } else { + toplevel .man -class Man + wm title .man "[file tail $argv0] help: $page" + menu .man.menu + .man.menu add cascade -label File -menu [set m [menu .man.menu.file]] + .man configure -menu .man.menu + $m add command -label Close -command {destroy .man} + text .man.text -yscrollcommand {.man.y set} + scrollbar .man.y -command {.man.text yview} -orient vert + grid .man.text .man.y -sticky news + grid columnconfigure .man 0 -weight 1 + grid columnconfigure .man 1 -weight 0 + } + .man.text insert end [exec man $page 2>/dev/null | col -b ] +} + +proc load_file {{name {}}} { +global filename +if ![string length $name] {set name [tk_getOpenFile -filetypes { +{{Msword files} .doc} +{{RTF files} .rtf} +{{MS Write files} .wri} +{{All files} *}} ]} +if ![string 
length $name] return +if ![file readable $name] { + return -code error "Cannot open file $name" +} +set filename $name +.mainmenu entryconfigure Encoding -state normal +.mainmenu.file entryconfigure "Save As..." -state normal +.mainmenu entryconfigure "Edit" -state normal +.mainmenu entryconfigure "Find" -state normal +reread +} + +proc make_opt {var flag} { + upvar #0 $var charset + switch $charset { + "Default" {return ""} + "unicode" {return "-u"} + default {return "$flag $charset"} + } +} +proc reread {args} { +global filename in_charset out_charset + +set inopt [make_opt in_charset -s] +set f [open "|catdoc -w $inopt -d utf-8 \"$filename\"" r] +fconfigure $f -encoding utf-8 +.text configure -state normal +.text delete 0.0 end +.text insert 0.0 [read $f] doc +.text mark set insert 1.0 +.text configure -state disabled +.text see 1.0 +if [catch {close $f} msg] { + tk_messageBox -icon error -title error -message $msg -type ok + return +} +} +proc write_file {{name {}}} { + global filename + if ![string length $name] { + set name [tk_getSaveFile -filetypes { + {{Text files} .txt} + {{LaTeX files} .tex}}] + } + if ![string length $name] return + if {[file extension $name]==".tex"} { + eval exec catdoc -t [make_opt in_charset -s] [make_opt out_charset -d]\ + [list $filename] > [list $name] + } else { + eval exec catdoc [make_opt in_charset -s] [make_opt out_charset -d]\ + [list $filename] > [list $name] + } +} +# -postcommand for Edit menu +proc EditEnable {} { +if [llength [.text tag ranges sel]] { + .mainmenu.edit entryconfigure Copy -state normal +} else { + .mainmenu.edit entryconfigure Copy -state disabled +} +} +proc CopySel {} { +clipboard clear +clipboard append -- [.text get sel.first sel.last] +} +proc FindDialog {} { +make_transient .find "Find" +frame .find.top +label .find.top.l -text "Find" +entry .find.top.e -width 30 -textvar FindPattern +bind .find.top.e ".find.b.find invoke" +pack .find.top.l .find.top.e -side left +FindOptionFrame +frame .find.b 
# -postcommand for the Find menu: "Find Again" is only usable once a
# pattern has been entered.
proc EnableSearch {} {
    global FindPattern ReplaceString
    if ![info exists FindPattern] {
        .mainmenu.search entryconfigure "Find Again" -state disabled
    } else {
        .mainmenu.search entryconfigure "Find Again" -state normal
    }
}

# Creates (or re-creates) toplevel $wpath as a transient dialog of the
# main window, placed a third of the way into it, with the given title.
proc make_transient {wpath title} {
    set x [expr [winfo rootx .]+[winfo width .]/3]
    set y [expr [winfo rooty .]+[winfo height .]/3]
    catch {destroy $wpath}
    toplevel $wpath
    wm transient $wpath .
    wm positionfrom $wpath program
    wm geometry $wpath +$x+$y
    wm title $wpath $title
}

# Builds the option row of the Find dialog. The checkbuttons store the
# literal switches later passed to "text search".
proc FindOptionFrame {} {
    frame .find.opt
    checkbutton .find.opt.dir -variable FindDir -onvalue -backwards\
        -offvalue -forwards -text Backward
    checkbutton .find.opt.regex -variable FindMode -onvalue\
        -regex -offvalue -exact -text RegExp
    checkbutton .find.opt.case -variable FindCase -onvalue -nocase -offvalue {}\
        -text "Ignore case"
    pack .find.opt.dir .find.opt.regex .find.opt.case -side left
}

# Searches .text for FindPattern starting at the insert mark, using the
# FindMode / FindDir / FindCase switches. On success the match is selected,
# made visible and 1 is returned; otherwise 0 is returned and, unless
# quiet is true, a "not found" box is shown.
proc DoFind {{quiet 0}} {
    global FindPattern FindMode FindDir FindCase
    if {![info exists FindPattern] || ![string length $FindPattern]} {
        return 0
    }
    # Stop at the end of the text in the direction of travel (no wrapping).
    set backwards [string equal $FindDir -backwards]
    if {$backwards} {
        set stopindex 1.0
    } else {
        set stopindex end
    }
    # Build the command as a list so the pattern is never re-parsed;
    # -count reports the length of the text actually matched, which is
    # right for both exact and regexp searches.
    set cmd [list .text search -count matchLen]
    if {[string length $FindCase]} {
        lappend cmd $FindCase
    }
    lappend cmd $FindMode $FindDir -- $FindPattern insert $stopindex
    set index [eval $cmd]
    if {![string length $index]} {
        if {!$quiet} {
            tk_messageBox -type ok -title "Not found" -message "Pattern not found"
        }
        return 0
    }
    set matchEnd "$index + $matchLen chars"
    .text tag remove sel 0.0 end
    .text tag add sel $index $matchEnd
    # Leave the insert mark on the far side of the match for the current
    # direction, so "Find Again" moves on instead of finding it again.
    if {$backwards} {
        .text mark set insert $index
    } else {
        .text mark set insert $matchEnd
    }
    .text see $index
    .text see insert
    focus .text
    return 1
}

# Shows the About box.
proc AboutDialog {} {
    make_transient .about "About WordView"
    message .about.m -aspect 250 -text "MS-Word viewer for UNIX
Copyright (c) by Victor B. Wagner 1997-98
This program is distributed under
GNU General Public License Version 2 or above
Check http://www.gnu.org/copyleft/gpl.html for copying
and warranty conditions" -justify center
    button .about.ok -text Ok -command {destroy .about}
    pack .about.m .about.ok
}
See convert_char ina */ +/* charset.c */ +/************************************************************************/ +static char outputbuffer[LINE_BUF_SIZE]=""; +void out_char(const char *chunk) { + static int bufpos=0; + int eol_flag=0; + const char *p; char *q; + if (!wrap_margin) { + fputs(chunk,stdout); + return; + } + + for (q=outputbuffer+bufpos,p=chunk; + *p; + *(q++)=*(p++),bufpos++) { + if (*p=='\n') eol_flag=1; + } + *q=0; + /* This strcat is safe. wrap margin setting + code in main.c ensures that wrap_margin is + less than LINE_BUF_SIZE-strlen(largest chunk) + */ + if (eol_flag) { + /* End of paragraph */ + char *q = map_subst(spec_chars,'\n'); + fputs(outputbuffer,stdout); + *outputbuffer=0; + bufpos=0; + if (q) { + fputs(q,stdout); + } else { + fputc('\n',stdout); + } + } else if (bufpos>wrap_margin) { + char *q=outputbuffer,*p=outputbuffer+wrap_margin; + + while (p>outputbuffer&&!isspace(*p)) p--; + if (p==outputbuffer) { + /*worst case - nowhere to wrap. Will use brute force */ + fwrite(outputbuffer,wrap_margin,1,stdout); + fputc('\n',stdout); + p=outputbuffer+wrap_margin; + } else { + *p=0;p++; + fputs(outputbuffer,stdout); + fputc('\n',stdout); + } + for(q=outputbuffer;*p;p++,q++) *q=*p; + bufpos=q-outputbuffer; + *q=0; + } +} + +/************************************************************************/ +/* Main output function. 
+ * Programs which read word-processor files should accumulate paragraphs + * in the buffer as array of unicode 16-bit codes and pass to this + * function + ************************************************************************/ +void output_paragraph(unsigned short int *buffer) { + unsigned short int *p; + int countout=0; + for (p=buffer;*p;p++) { + out_char(convert_char(*p)); + countout++; + } +} diff --git a/src/xls.h b/src/xls.h new file mode 100644 index 0000000..084958d --- /dev/null +++ b/src/xls.h @@ -0,0 +1,53 @@ +/*****************************************************************/ +/* Definition specific for Excel file handling */ +/* */ +/* This file is part of catdoc project */ +/* (c) Victor Wagner 2003, (c) Alex Ott 2003 */ +/*****************************************************************/ + +#ifndef XLS_H +#define XLS_H + +#include +#include +/* types of quoting */ +#define QUOTE_NEVER 0 +#define QUOTE_SPACES_ONLY 1 +#define QUOTE_ALL_STRINGS 2 +#define QUOTE_EVERYTHING 3 + +struct rowdescr { + int last, end; + unsigned char **cells; +}; +/* structure to hold parsed table */ +extern struct rowdescr *rowptr; +extern int startrow; +/* xls2csv-specific configuration */ +extern char cell_separator; +extern int quote_mode; +void print_sheet(void); +void free_sheet(void); +void print_value(unsigned char *value); +char *format_double(char *rec,int offset,int format_code); +char *format_int(int value,int format_code); +char *format_rk(char *rec,short int format_code); +char *gettypename(long rectype); +void parse_sst(char *sstbuf,int bufsize); +void process_item (int rectype, int reclen, char *rec); +unsigned char **allocate(int row,int col); +char *copy_unicode_string(unsigned char **src); +char convert8to8(char *src,int count); +char *convert16to8(char *src,int count); +void do_table(FILE *input,char *filename); +char *mark_string(char *instr); + +/* ---- added by Igor ---- */ +void SetFormatIdxUsed(int format_code); +/* -- end added by Igor -- */ + + 
+#ifdef __TURBOC__ +#define rint(x) floor((x+0.5)) +#endif +#endif diff --git a/src/xls2csv.c b/src/xls2csv.c new file mode 100644 index 0000000..243e801 --- /dev/null +++ b/src/xls2csv.c @@ -0,0 +1,180 @@ +/*****************************************************************/ +/* Main program for parsing XLS files */ +/* */ +/* This file is part of catdoc project */ +/* (c) David Rysdam 1998 */ +/* (c) Victor Wagner 1998-2003, (c) Alex Ott 2003 */ +/*****************************************************************/ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#include +#include "xltypes.h" +#include "catdoc.h" +#include +#include +#include "catdoc.h" +#include "float.h" +#include "xls.h" + +#ifdef __TURBOC__ +#define strcasecmp(a,b) strcmpi(a,b) +#endif +extern char *forced_date_format; +extern char number_format[]; +extern char *sheet_separator; +/************************************************************************/ +/* Displays help message */ +/************************************************************************/ +void help (void) { + printf("Usage:\n xls2csv [-xlV] [-g number] [-f date-format] [-b string] [-s charset] [-d charset] [-c char] [ -q number] files\n"); +} +/* Defines unicode chars which should be + replaced by strings before UNICODE->target chatset + mappigs are applied i.e. 
TeX special chars like % + */ +char *input_buffer, *output_buffer; +int main(int argc, char *argv[]) +{ + FILE *input; + FILE *new_file, *ole_file; + char *filename =NULL; + short int *tmp_charset; + int c; + int i; + char *tempname; + read_config_file(SYSTEMRC); +#ifdef USERRC + tempname=find_file(strdup(USERRC),getenv("HOME")); + if (tempname) { + read_config_file(tempname); + free(tempname); + } +#endif +#ifdef HAVE_LANGINFO + get_locale_charset(); +#endif + + check_charset(&dest_csname,dest_csname); + + while ((c=getopt(argc,argv,"Vlf:s:d:xq:c:b:g:p:"))!=-1) { + switch(c) { + case 'l': + list_charsets(); exit(0); + case 'x': + unknown_as_hex = 1; break; + case 's': + check_charset(&source_csname,optarg); + source_charset=read_charset(source_csname); + break; + case 'b': + sheet_separator= strdup(optarg); + break; + case 'd': + check_charset(&dest_csname,optarg); + break; + case 'q': + { char *errptr; + quote_mode = strtol(optarg,&errptr,0); + if ((errptr && *errptr)||quote_mode<0||quote_mode>3) { + fprintf(stderr, + "argument of -q should be number from 0 to 3\n"); + exit(1); + } + } + break; + case 'c': + cell_separator = optarg[0]; + break; + case 'f': + forced_date_format = strdup(optarg); + break; + case 'g': + { char *strend; + int digits = strtol(optarg,&strend,0); + if (*strend||digits<0||digits>DBL_DIG) { + fprintf(stderr,"value of -g option should be numbe between 0 and %d, not '%s'\n", DBL_DIG, optarg); + exit(1); + } + sprintf(number_format,"%%.%dg",digits); + } + break; + case 'V': printf("Catdoc Version %s\n",CATDOC_VERSION); + exit(0); + default: + help(); + exit(1); + } + } +/* If we are using system strftime, we need to set LC_TIME locale + * category unless choosen charset is not same as system locale + */ +#if defined(HAVE_LANGINFO) && defined(HAVE_STRFTIME) && !defined(__TURB0C__) + set_time_locale(); +#endif + /* charset conversion init*/ + input_buffer=malloc(FILE_BUFFER); + if (strcmp(dest_csname,"utf-8")) { + 
tmp_charset=read_charset(dest_csname); + if (!tmp_charset) { + fprintf(stderr,"Cannot load target charset %s\n",dest_csname); + exit(1); + } + target_charset=make_reverse_map(tmp_charset); + free(tmp_charset); + } else { + target_charset=NULL; + } + spec_chars=read_substmap(stradd("ascii",SPEC_EXT)); + if (!spec_chars) { + fprintf(stderr,"Cannod read substitution map ascii%s\n", + SPEC_EXT); + exit(1); + } + replacements=read_substmap(stradd("ascii",REPL_EXT)); + if (!replacements) { + fprintf(stderr,"Cannod read substitution map ascii%s\n", + REPL_EXT); + exit(1); + } + if (optind>=argc) { + if (isatty(fileno(stdin))) { + help(); + exit(0); + } + do_table(stdin,"STDIN"); + exit (0); + } + for (i=optind;iname); */ + if (res >= 0) { + if (strcasecmp(((oleEntry*)ole_file)->name , "Workbook") == 0 + || strcasecmp(((oleEntry*)ole_file)->name,"Book") == 0) { + do_table(ole_file,filename); + } + } + ole_close(ole_file); + } + set_std_func(); + ole_finish(); + fclose(new_file); + } else { + fprintf(stderr, "%s is not OLE file or Error\n", filename); + } + } + return 0; +} diff --git a/src/xlsparse.c b/src/xlsparse.c new file mode 100644 index 0000000..139bcfe --- /dev/null +++ b/src/xlsparse.c @@ -0,0 +1,777 @@ +/*****************************************************************/ +/* BIFF-stream (excel file) parsing */ +/* */ +/* This file is part of catdoc project */ +/* (c) David Rysdam 1998 */ +/* (c) Victor Wagner 1998-2003, (c) Alex Ott 2003 */ +/*****************************************************************/ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include "xls.h" +#include "catdoc.h" +#include "xltypes.h" +#include "float.h" +#include +#include +#ifndef HAVE_STRFTIME +#include "../compat/strftime.h" +#endif +static unsigned char rec[MAX_MS_RECSIZE]; +int biff_version=0; +short int *formatTable=NULL; +char *forced_date_format = NULL; +size_t formatTableIndex = 0; +size_t formatTableSize = 0; +double date_shift = 25569.0; +#define 
FLT_FORMAT(a,b,c) a #b c +#define MK_FORMAT(x) FLT_FORMAT("%.",x,"g") +char number_format[8]=MK_FORMAT(DBL_DIG); + +void CleanUpFormatIdxUsed(void); + +void do_table(FILE *input,char *filename) { + long rectype; + long reclen,build_year=0,build_rel=0,offset=0; + int eof_flag=0; + int itemsread=1; + date_shift=25569.0; /* Windows 1900 date system */ + CleanUpFormatIdxUsed(); + while (itemsread) { + catdoc_read(rec,2,1,input); + biff_version=getshort(rec,0); + catdoc_read(rec,2,1,input); + reclen=getshort(rec,0); + if ( biff_version == 0x0809 || biff_version == 0x0409 || + biff_version == 0x0209 || biff_version == 0x0009 ) { + if (reclen==8 || reclen==16) { + if (biff_version == 0x0809 ) { + itemsread=catdoc_read(rec,4,1,input); + build_year=getshort(rec+2,0); + build_rel=getshort(rec,0); + if(build_year > 5 ) { + itemsread=catdoc_read(rec,8,1,input); + biff_version=8; + offset=12; + } + else { + biff_version=7; + offset=4; + } + } else if (biff_version == 0x0209 ) { + biff_version=3; + offset=2; + } else if (biff_version == 0x0409 ) { + offset=2; + biff_version=4; + } else { + biff_version=2; + } + itemsread=catdoc_read(rec,reclen-offset,1,input); + break; + } else { + fprintf(stderr,"%s: Invalid BOF record\n",filename); + return; + } + } else { + itemsread=catdoc_read(rec,126,1,input); + } + } + if (catdoc_eof(input)) { + fprintf(stderr,"%s: No BOF record found\n",filename); + exit(1); + } + while(itemsread){ + char buffer[2]; + rectype = 0; + itemsread = catdoc_read(buffer, 2, 1, input); + if (catdoc_eof(input)) { + process_item(MSEOF,0,NULL); + return; + } + + rectype=getshort(buffer,0); + if(itemsread == 0) + break; + reclen=0; + + itemsread = catdoc_read(buffer, 2, 1, input); + reclen=getshort(buffer,0); + if (reclen && reclen 0){ + itemsread = catdoc_read(rec, 1, reclen, input); + rec[reclen] = '\0'; + } + if(eof_flag) { + if (rectype != BOF) { + break; + } + } +/* fprintf(stderr,"Rectype 0x%04X reclen=%d\n",rectype, reclen); */ + 
process_item(rectype,reclen,rec); + if (rectype == MSEOF) { + eof_flag=1; + } else { + eof_flag=0; + } + } + return; +} +unsigned char **sst=NULL;/* Shared string table parsed into array of strings in + output encoding*/ +int sstsize = 0; /*Number of strings in SST*/ +unsigned char *sstBuffer=NULL; /*Unparsed sst to accumulate all its parts*/ +int sstBytes = 0; /*Size of SST Data, already accumulated in the buffer */ +int codepage=1251; /*default*/ +int prev_rectype=0; +/* holds a pointer to formula value, becouse value itself would be in + * next biff record + */ +unsigned char **saved_reference = NULL; + +void process_item (int rectype, int reclen, char *rec) { + if (rectype != CONTINUE && prev_rectype == SST) { + /* we have accumulated unparsed SST, and now encountered + * another record, which indicates that SST is ended */ + /* fprintf(stderr,"parse sst!\n");*/ + parse_sst(sstBuffer,sstBytes); + } + switch (rectype) { + case FILEPASS: { + fprintf(stderr,"File is encrypted\n"); + exit(69); + break; + } + case WRITEPROT: { + fprintf(stderr,"File is write protected\n"); + break; + } + + case 0x42: { + if (source_charset) break; + codepage=getshort(rec,0); + /*fprintf(stderr,"CODEPAGE %d\n",codepage); */ + if (codepage!=1200) { + const char *cp = charset_from_codepage(codepage); + source_charset=read_charset(cp); + } + break; + } + case FORMAT: { + int format_code; + format_code=getshort(rec,0); + SetFormatIdxUsed(format_code); + /* this debug code prints format string */ + /* + int i; + char *ptr; + fprintf(stderr,"Format %x \"",format_code); + if (rec[2] == reclen - 3 && rec[3] != 0) { + for (i=0,ptr=rec+3;i=sstsize|| string_no < 0 ) { + fprintf(stderr,"string index out of boundary\n"); + exit(1); + } else if (sst[string_no] !=NULL) { + int len; + char *outptr; + len=strlen(sst[string_no]); + outptr=*pcell=malloc(len+1); + strcpy(outptr,sst[string_no]); + } else { + *pcell=malloc(1); + strcpy(*pcell,""); + } + break; + } + case 0x03: + case 0x103: + case 0x303: 
+ case NUMBER: { + int row,col; + unsigned char **pcell; + + saved_reference=NULL; + row = getshort(rec,0)-startrow; + col = getshort(rec,2); + pcell=allocate(row,col); + *pcell=strdup(format_double(rec,6,getshort(rec,4))); + break; + } + case INTEGER_CELL: { + int row,col; + unsigned char **pcell; + + row = getshort(rec,0)-startrow; + col = getshort(rec,2); + pcell=allocate(row,col); + *pcell=strdup(format_int(getshort(rec,7),getshort(rec,4))); + break; + + } + case RK: { + int row,col,format_code; + unsigned char **pcell; + + saved_reference=NULL; + row = getshort(rec,0)-startrow; + col = getshort(rec,2); + pcell=allocate(row,col); + format_code = getshort(rec,4); + *pcell=strdup(format_rk(rec+6,format_code)); + break; + } + case MULRK: { + int row,col,startcol,endcol,offset,format_code; + unsigned char **pcell; + row = getshort(rec,0)-startrow; + startcol = getshort(rec,2); + endcol = getshort(rec,reclen-2); + saved_reference=NULL; + + for (offset=4,col=startcol;col<=endcol;offset+=6,col++) { + pcell=allocate(row,col); + format_code=getshort(rec,offset); + *pcell=strdup(format_rk(rec+offset+2,format_code)); + + } + break; + } + case FORMULA: { + int row,col; + unsigned char **pcell; + saved_reference=NULL; + row = getshort(rec,0)-startrow; + col = getshort(rec,2); + pcell=allocate(row,col); + if (((unsigned char)rec[12]==0xFF)&&(unsigned char)rec[13]==0xFF) { + /* not a floating point value */ + if (rec[6]==1) { + /*boolean*/ + char buf[2]="0"; + buf[0]+=rec[9]; + *pcell=strdup(buf); + } else if (rec[6]==2) { + /*error*/ + char buf[6]="ERROR"; + *pcell=strdup(buf); + } else if (rec[6]==0) { + saved_reference=pcell; + } + } else { + int format_code=getshort(rec,4); + *pcell=strdup(format_double(rec,6,format_code)); + } + break; + } + case STRING: { + unsigned char *src=(unsigned char *)rec; + if (!saved_reference) { + fprintf(stderr,"String record without preceeding string formula\n"); + break; + } + *saved_reference=copy_unicode_string(&src); + break; + } + case 
BOF: { + if (rowptr) { + fprintf(stderr,"BOF when current sheet is not flushed\n"); + free_sheet(); + } + break; + } + case XF: + case 0x43: /*from perl module Spreadsheet::ParseExecel */ + { + short int formatIndex = getshort(rec,2); + /* we are interested only in format index here */ + if (formatTableIndex >= formatTableSize) { + formatTable=realloc(formatTable, + (formatTableSize+=16)*sizeof(short int)); + + if (!formatTable) { + fprintf(stderr,"Out of memory for format table"); + exit (1); + } + } + formatTable[formatTableIndex++] = formatIndex; + break; + } + case MS1904: /* Macintosh 1904 date system */ + date_shift=24107.0; + break; + + + case MSEOF: { + if (!rowptr) break; + print_sheet(); + free_sheet(); + break; + } + case ROW: { + /* fprintf(stderr,"Row! %d %d %d\n",getshort(rec,0), getshort(rec+2,0),getshort(rec+4,0)); */ + break; + } + case INDEX: { + /* fprintf(stderr,"INDEX! %d %d\n", getlong(rec+4,0), getlong(rec+8,0)); */ + break; + } + default: { +#if 0 + fprintf(stderr,"Unknown record 0x%x\n length %d\n",rectype,reclen); +#endif + } + } + prev_rectype=rectype; +} + +/* + * Extracts string from sst and returns mallocked copy of it + */ +char *copy_unicode_string (unsigned char **src) { + int count=0; + int flags = 0; + int start_offset=0; + int to_skip=0; /* ÉÓÐÏÌØÚÕÅÔÓÑ ÄÌÑ ÐÏÄÓÞÅÔÁ ÄÌÉÎÙ ÄÁÎÎÙÈ + * ÚÁ ËÏÎÃÏÍ ÓÔÒÏËÉ */ + int offset = 1; /* ÄÌÑ ÕÞÅÔÁ ÐÅÒÅÍÅÎÎÏÊ ÄÌÉÎÙ ÐÅÒ×ÏÇÏ ÐÏÌÑ */ + int charsize; + /* char *realstart=*src; */ + char *dest; /* ËÕÄÁ ÂÕÄÅÍ ËÏÐÉÒÏ×ÁÔØ ÓÔÒÏËÕ */ + char *s,*d,*c; + + int i,u,l,len; + + /* for(i=0;i<20;i++) */ + /* fprintf(stderr,"%02x ",(*src)[i]); */ + /* fprintf(stderr,"\n"); */ + + flags = *(*src+1+offset); + if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 || + flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) { + count=**src; + flags = *(*src+offset); + offset --; + flags = *(*src+1+offset); + if (! 
( flags == 0 || flags == 1 || flags == 8 || flags == 9 || + flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) { + /* fprintf(stderr,"Strange flags = %d, returning NULL\n", flags); */ + return NULL; + } + } + else { + count=getshort(*src,0); + } + charsize=(flags &0x01) ? 2 : 1; + + switch (flags & 12 ) { + case 0x0c: /* Far East with RichText formating */ + to_skip=4*getshort(*src,2+offset)+getlong(*src, 4+offset); + start_offset=2+offset+2+4; + /* fprintf(stderr,"Far East with RichText formating\n"); */ + break; + + case 0x08: /* With RichText formating */ + to_skip=4*getshort(*src,2+offset); + start_offset=2+offset+2; + /* fprintf(stderr,"With RichText formating %d\n",getshort(*src,2+offset)); */ + break; + + case 0x04: /* Far East */ + to_skip=getlong(*src, 2+offset); + start_offset=2+offset+4; + /* fprintf(stderr,"Far East\n"); */ + break; + + default: + to_skip=0; + start_offset=2+offset; + /* fprintf(stderr,"Default string\n"); */ + } + + /* fprintf(stderr,"count=%d skip=%d start_offset=%d\n", */ + /* count, to_skip, start_offset); */ + /* Á ÚÄÅÓØ ÍÙ ËÏÐÉÒÕÅÍ ÓÔÒÏËÕ */ + if ( (dest=malloc(count+1)) == NULL ) { + perror("Dest string alloc error"); + *src+=(to_skip+start_offset+(count*charsize)); + exit(0); + } + *src+=start_offset; + len = count; + *dest=0;l=0; + for (s=*src,d=dest,i=0;i=len) { + len+=16; + dest=realloc(dest,len+1); + } + d=dest+l; + strcpy(d,c); + d+=dl; + l+=dl; + } + } + *src=s+to_skip; + return dest; +} + + +/* + * Format code is index into format table (which is list of XF records + * in the file + * Second word of XF record is format type idnex + * format index between 0x0E and 0x16 also between 0x2D and ox2F denotes + * date if it is not used for explicitly stored formats. + * BuiltInDateFormatIdx converts format index into index of explicit + * built-in date formats sutable for strftime. 
+ */ +int BuiltInDateFormatIdx (int index) { + int offset; + offset=1; /* offset of date formats */ + /* 0 is used as false -- format not found */ + if ((index>= 0x0E) && (index<=0x16)) { + return offset+index-0x0E; + } else + if ((index>=0x2d) && (index<=0x2F)) { + return offset+index-0x2d+9; + } else if (index==0xa4) { + return 12+offset; + } else + return 0; +} + +/* + * GetBuiltInDateFormat stores and returns + * built in xls2csv strftime formats. + */ +#define NUMOFDATEFORMATS 13 +char *GetBuiltInDateFormat(int dateindex) { + static char *formats[]={ + /* reserved */ NULL, /* BuiltInDateFormatIdx use dateindex=0 as flag format not found */ + /* 0x0E */ "%m-%d-%y", /* 01 */ + /* 0x0F */ "%d-%b-%y", /* 02 */ + /* 0x10 */ "%d-%b", /* 03 */ + /* 0x11 */ "%b-%d", /* 04 */ + /* 0x12 */ "%l:%M %p", /* 05 */ + /* 0x13 */ "%l:%M:%S %p", /* 06 */ + /* 0x14 */ "%H:%M", /* 07 */ + /* 0x15 */ "%H:%M:%S", /* 08 */ + /* 0x16 */ "%m-%d-%y %H:%M", /* 09 */ + /* 0x2d */ "%M:%S", /* 10 */ + /* 0x2e */ "%H:%M:%S", /* 11 */ + /* 0x2f */ "%M:%S", /* 12 */ + /* 0xa4 */ "%m.%d.%Y %l:%M:%S %p" /* 13 */ + }; + if (dateindex>0 && dateindex <= NUMOFDATEFORMATS) { + return formats[dateindex]; + } else + return NULL; +} + +static char FormatIdxUsed[NUMOFDATEFORMATS]; + +void CleanUpFormatIdxUsed() { + int i; + for (i=0;i=formatTableIndex) { + fprintf(stderr,"Format code %d is used before definition\n",format_code); + return NULL; + } + + index = formatTable[format_code]; + if (IsFormatIdxUsed(index)) { + fprintf(stderr,"Format %x is redefined\n",index); + /* this format is something user-defined --- not a standard built-in date*/ + return NULL; + } + dateindex=BuiltInDateFormatIdx(index); + if (dateindex) { + if (forced_date_format) return forced_date_format; + return GetBuiltInDateFormat(dateindex); + } else + return NULL; +} + + + +time_t float2date(double d); +/* + * Extracts floating point value and formats it + */ + +char *number2string(double d,short int format_code) { + static char 
buffer [128]; + char *datefmt; + if ((datefmt=isDateFormat(format_code))!=NULL) { + time_t t = float2date(d); + strftime(buffer, 127,datefmt, gmtime(&t)); + } else { + sprintf(buffer,number_format,d); + } + return buffer; +} + +char *format_double(char *rec,int offset,int format_code) { + union { char cc[8]; + double d;} dconv; + char *d,*s; + int i; +# ifdef WORDS_BIGENDIAN + for(s=rec+offset+8,d=dconv.cc,i=0; + i<8;i++) *(d++)=*(--s); +# else + for(s=rec+offset,d=dconv.cc,i=0; + i<8;i++) *(d++)=*(s++); +# endif + return number2string(dconv.d,format_code); +} + +/* + * Formats integer value into static buffer + */ +char *format_int(int value,int format_code) { + static char buffer[12]; + sprintf(buffer,"%i",value); + return buffer; +} +/* + * Formats RK record + */ +char* format_rk(char *rec,short int format_code) { + double value=0.0; + int i; + + if ( *(rec) & 0x02 ) + { + value=(double)(getlong(rec,0)>>2); + } + else { + union { char cc[8]; + double d;} dconv; + char *d,*s; + for(i=0;i<8;i++) + dconv.cc[i]='\0'; +# ifdef WORDS_BIGENDIAN + for(s=rec+4,d=dconv.cc,i=0; i<4;i++) + *(d++)=*(--s); + dconv.cc[0]=dconv.cc[0] & 0xfc; +# else + for(s=rec,d=dconv.cc+4,i=0; + i<4;i++) *(d++)=*(s++); + dconv.cc[3]=dconv.cc[3] & 0xfc; +# endif + value=dconv.d; + } + if ( *(rec) & 0x01 ) + value=value*0.01; + return number2string(value,format_code); +} + + +/* + * Converts excel date into time_t + */ +time_t float2date(double f) { + /* Hacked version. Excell stores date as floating point count of days + * since 1.1.1900. 
or 1.1.1904 + * We are substracting value of 1.1.1970 and multiplying + * by 86400 thus getting seconds from the epoch + */ + return rint((f-date_shift)*86400); +} + +/* + * Parses SST into array of strings + */ +void parse_sst(char *sstbuf,int bufsize) { + int i; /* index into sst */ + unsigned char *curString; /* pointer into unparsed buffer*/ + unsigned char *barrier=(unsigned char *)sstbuf+bufsize; /*pointer to end of buffer*/ + unsigned char **parsedString;/*pointer into parsed array*/ + + sstsize = getlong(sstbuf+4,0); + sst=malloc(sstsize*sizeof(char *)); + + if (sst == NULL) { + perror("SST allocation error"); + exit(1); + } + memset(sst,0,sstsize*sizeof(char *)); + for (i=0,parsedString=sst,curString=sstbuf+8; + i