/* GNU Ocrad - Optical Character Recognition program Copyright (C) 2003-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include "ucs.h" int UCS::base_letter( const int code ) { switch( code ) { case CAGRAVE: case CAACUTE: case CACIRCU: case CATILDE: case CADIAER: case CARING : return 'A'; case CCCEDI : return 'C'; case CEGRAVE: case CEACUTE: case CECIRCU: case CEDIAER: return 'E'; case CGBREVE: return 'G'; case CIGRAVE: case CIACUTE: case CICIRCU: case CIDIAER: case CIDOT : return 'I'; case CNTILDE: return 'N'; case COGRAVE: case COACUTE: case COCIRCU: case COTILDE: case CODIAER: return 'O'; case CSCEDI : case CSCARON: return 'S'; case CUGRAVE: case CUACUTE: case CUCIRCU: case CUDIAER: return 'U'; case CYACUTE: case CYDIAER: return 'Y'; case CZCARON: return 'Z'; case SAGRAVE: case SAACUTE: case SACIRCU: case SATILDE: case SADIAER: case SARING : return 'a'; case SCCEDI : return 'c'; case SEGRAVE: case SEACUTE: case SECIRCU: case SEDIAER: return 'e'; case SGBREVE: return 'g'; case SIGRAVE: case SIACUTE: case SICIRCU: case SIDIAER: case SINODOT: return 'i'; case SNTILDE: return 'n'; case SOGRAVE: case SOACUTE: case SOCIRCU: case SOTILDE: case SODIAER: return 'o'; case SSCEDI : case SSCARON: return 's'; case SUGRAVE: case SUACUTE: case SUCIRCU: case SUDIAER: return 'u'; case SYACUTE: case SYDIAER: return 'y'; case SZCARON: return 'z'; default: return 0; } } int UCS::compose( const int letter, const int accent ) { switch( letter ) { case 'A': if( accent == '\'') return CAACUTE; if( accent == '`' ) return CAGRAVE; if( accent == '^' ) return CACIRCU; if( accent == ':' ) return CADIAER; break; case 'E': if( accent == '\'') return CEACUTE; if( accent == '`' ) return CEGRAVE; if( accent == '^' ) return CECIRCU; if( accent == ':' ) return CEDIAER; break; case 'G': return CGBREVE; case '[': case 'I': if( accent == '\'') return CIACUTE; if( accent == '`' ) return CIGRAVE; if( accent == '^' ) return CICIRCU; if( accent == ':' ) return CIDIAER; break; case 'N': if( accent != ':' ) return CNTILDE; break; case 'O': if( accent == '\'') return COACUTE; if( accent == '`' ) return COGRAVE; if( accent == '^' ) return COCIRCU; if( accent == ':' ) return CODIAER; break; case 'S': return CSCARON; case 'U': case 'V': if( accent == '\'') return CUACUTE; if( accent == '`' ) return CUGRAVE; if( accent == '^' ) return CUCIRCU; if( accent == ':' ) return CUDIAER; break; case 'Y': if( accent == '\'') return CYACUTE; if( accent == ':' ) return CYDIAER; break; case 'Z': return CZCARON; case 'a': if( accent == '\'') return SAACUTE; if( accent == '`' ) return SAGRAVE; if( accent == '^' ) return SACIRCU; if( accent == ':' ) return SADIAER; break; case 'e': if( accent == '\'') return SEACUTE; if( accent == '`' ) return SEGRAVE; if( accent == '^' ) return SECIRCU; if( accent == ':' ) return SEDIAER; break; case '9': case 'g': return SGBREVE; case '|': case ']': case 'i': case 'l': if( accent == '\'') return SIACUTE; if( accent == '`' ) return SIGRAVE; if( accent == '^' ) return SICIRCU; if( accent == ':' ) return SIDIAER; break; case 'n': if( accent != ':' ) return SNTILDE; break; case 'o': if( accent == '\'') return SOACUTE; if( accent == '`' ) return SOGRAVE; if( accent == '^' ) return SOCIRCU; if( accent == ':' ) return SODIAER; break; case 's': return SSCARON; case 'u': case 'v': if( accent == '\'') return SUACUTE; if( accent == '`' ) return SUGRAVE; if( accent == '^' ) return SUCIRCU; if( accent == ':' ) return SUDIAER; break; case 'y': if( accent == '\'') return SYACUTE; if( accent == ':' ) return SYDIAER; break; case 'z': return SZCARON; } return 0; } bool UCS::isalnum( const int code ) { return ( UCS::isalpha( code ) || UCS::isdigit( code ) ); } bool UCS::isalpha( const int code ) { return ( ( code < 128 && std::isalpha( code ) ) || base_letter( code ) ); } bool UCS::ishigh( const int code ) { if( isupper( code ) || isdigit( code ) ) return true; switch( code ) { case 'b': case 'd': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'p': case 'q': case 't': case 'y': case '|': return true; default : return false; } } bool UCS::islower( const int code ) { if( code < 128 && std::islower( code ) ) return true; const int base = base_letter( code ); return ( base && std::islower( base ) ); } bool UCS::islower_ambiguous( const int code ) { if( islower_small_ambiguous( code ) ) return true; switch( code ) { case 'k': case 'p': case SCCEDI: case SIGRAVE: case SIACUTE: case SICIRCU: case SIDIAER: case SOGRAVE: case SOACUTE: case SOCIRCU: case SOTILDE: case SODIAER: case SUGRAVE: case SUACUTE: case SUCIRCU: case SUDIAER: case SSCEDI: case SSCARON: case SZCARON: return true; default : return false; } } bool UCS::islower_small( const int code ) { if( code >= 128 || !std::islower( code ) ) return false; switch( code ) { case 'a': case 'c': case 'e': case 'm': case 'n': case 'o': case 'r': case 's': case 'u': case 'v': case 'w': case 'x': case 'z': return true; default : return false; } } bool UCS::islower_small_ambiguous( const int code ) { if( code >= 128 || !std::islower( code ) ) return false; switch( code ) { case 'c': case 'o': case 's': case 'u': case 'v': case 'w': case 'x': case 'z': return true; default : return false; } } bool UCS::isspace( const int code ) { return ( code < 128 && std::isspace( code ) ) || code == 0xA0; } bool UCS::isupper( const int code ) { if( code < 128 && std::isupper( code ) ) return true; const int base = base_letter( code ); return ( base && std::isupper( base ) ); } bool UCS::isupper_normal_width( const int code ) { if( code >= 128 || !std::isupper( code ) ) return false; switch( code ) { case 'I': case 'J': case 'L': case 'M': case 'Q': case 'W': return false; default : return true; } } bool UCS::isvowel( int code ) { if( code >= 128 ) code = base_letter( code ); if( !code || !std::isalpha( code ) ) return false; code = std::tolower( code ); return ( code == 'a' || code == 'e' || code == 'i' || code == 'o' || code == 'u' ); } unsigned char UCS::map_to_byte( const int code ) { if( code < 0 ) return 0; if( code < 256 ) return code; switch( code ) { case CGBREVE: return 0xD0; case SGBREVE: return 0xF0; case CIDOT : return 0xDD; case SINODOT: return 0xFD; case CSCEDI : return 0xDE; case SSCEDI : return 0xFE; case CSCARON: return 0xA6; case SSCARON: return 0xA8; case CYDIAER: return 0xBE; case CZCARON: return 0xB4; case SZCARON: return 0xB8; case EURO : return 0xA4; default : return 0; } } int UCS::map_to_ucs( const unsigned char ch ) { switch( ch ) { case 0xA4: return EURO; case 0xA6: return CSCARON; case 0xA8: return SSCARON; case 0xB4: return CZCARON; case 0xB8: return SZCARON; case 0xBC: return CLIGOE; case 0xBD: return SLIGOE; case 0xBE: return CYDIAER; } return ch; } // does not work for 'code' == 0 const char * UCS::ucs_to_utf8( const int code ) { static char s[7]; if( code < 0 || code > 0x7FFFFFFF ) { s[0] = 0; return s; } // invalid code if( code < 128 ) { s[0] = code; s[1] = 0; return s; } // plain ascii int i, mask; if( code < 0x800 ) { i = 2; mask = 0xC0; } // 110X XXXX else if( code < 0x10000 ) { i = 3; mask = 0xE0; } // 1110 XXXX else if( code < 0x200000 ) { i = 4; mask = 0xF0; } // 1111 0XXX else if( code < 0x4000000 ) { i = 5; mask = 0xF8; } // 1111 10XX else { i = 6; mask = 0xFC; } // 1111 110X s[i] = 0; --i; int d = 0; for( ; i > 0; --i, d += 6 ) s[i] = 0x80 | ( ( code >> d ) & 0x3F ); // 10XX XXXX s[0] = mask | ( code >> d ); return s; } int UCS::to_nearest_digit( const int code ) { switch( code ) { case 'D': case 'O': case 'Q': case 'o': return '0'; case 'I': case 'L': case 'l': case '|': case SINODOT: return '1'; case 'Z': case 'z': return '2'; case 'A': case 'q': return '4'; case 'S': case 's': return '5'; case 'G': case 'b': case SOACUTE: return '6'; case 'J': case 'T': return '7'; case '&': case 'B': return '8'; case 'g': return '9'; default: return code; } } int UCS::to_nearest_letter( const int code ) { switch( code ) { case '0': return 'O'; case '1': return 'l'; case '2': return 'Z'; case '4': return 'q'; case '5': return 'S'; case '6': return SOACUTE; case '7': return 'I'; case '8': return 'B'; case '9': return 'g'; default: return code; } } int UCS::to_nearest_upper_num( const int code ) { switch( code ) { case '(': case '[': return 'C'; case 'l': case '|': return 'I'; case DEG: return 'O'; case MICRO: return 'U'; case POW1: case SINODOT: return '1'; case POW2: return '2'; case POW3: return '3'; case 'q': return '4'; case 'b': case SOACUTE: return '6'; case '&': return '8'; case 'g': case MASCORD: return '9'; } if( islower_ambiguous( code ) ) return toupper( code ); return code; } int UCS::toupper( const int code ) { if( code < 128 ) return std::toupper( code ); switch( code ) { case SAGRAVE: return CAGRAVE; case SAACUTE: return CAACUTE; case SACIRCU: return CACIRCU; case SATILDE: return CATILDE; case SADIAER: return CADIAER; case SARING : return CARING; case SCCEDI : return CCCEDI; case SEGRAVE: return CEGRAVE; case SEACUTE: return CEACUTE; case SECIRCU: return CECIRCU; case SEDIAER: return CEDIAER; case SGBREVE: return CGBREVE; case SIGRAVE: return CIGRAVE; case SIACUTE: return CIACUTE; case SICIRCU: return CICIRCU; case SIDIAER: return CIDIAER; case SNTILDE: return CNTILDE; case SOGRAVE: return COGRAVE; case SOACUTE: return COACUTE; case SOCIRCU: return COCIRCU; case SOTILDE: return COTILDE; case SODIAER: return CODIAER; case SSCEDI : return CSCEDI; case SSCARON: return CSCARON; case SUGRAVE: return CUGRAVE; case SUACUTE: return CUACUTE; case SUCIRCU: return CUCIRCU; case SUDIAER: return CUDIAER; case SYACUTE: return CYACUTE; case SYDIAER: return CYDIAER; case SZCARON: return CZCARON; default: return code; } }