/* Conversion of files between different charsets and surfaces. Copyright © 1996, 97, 98, 99, 00 Free Software Foundation, Inc. Contributed by François Pinard , 1996. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the `recode' Library; see the file `COPYING.LIB'. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "common.h" #include "base64.h" /* Classification of first 128 UCS-2 characters. */ #define D 1 /* set D -- direct character */ #define O 2 /* set O -- optional direct character */ #define W 4 /* direct white space */ #define z 0 static char classification[128] = { z, z, z, z, z, z, z, z, z, W, W, z, z, W, z, z, z, z, z, z, /* 0- 19 */ z, z, z, z, z, z, z, z, z, z, z, z, W, O, O, O, O, O, O, D, /* 20- 39 */ D, D, O, z, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, O, /* 40- 59 */ O, O, O, D, O, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, /* 60- 79 */ D, D, D, D, D, D, D, D, D, D, D, O, z, O, O, O, O, D, D, D, /* 80- 99 */ D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, /* 100-119 */ D, D, D, O, O, O, z, z /* 120-127 */ }; #undef z /* Should the character be directly represented in message headers? */ #define IS_HEADER_DIRECT(Character) \ (IS_ASCII (Character) && classification[Character] & (D | W)) /* Should the character be directly represented in message bodies? Here, & (D | O | W) is implied, since these are the only flags. */ #define IS_BODY_DIRECT(Character) \ (IS_ASCII (Character) && classification[Character]) /* Transformation routines. */ /* The following diagram shows the logical steps by which three UCS-2 characters get transformed into eight Base64 characters. It helps understanding shifts and masks in the transformation functions. .--------+--------. .--------+--------. .--------+--------. |aaaaaabb|bbbbcccc| |ccdddddd|eeeeeeff| |ffffgggg|gghhhhhh| `--------+--------' `--------+--------' `--------+--------' 6 2 4 4 2 6 6 2 4 4 2 6 .--------+--------+--------+--------+--------+--------+--------+--------. |00aaaaaa|00bbbbbb|00cccccc|00dddddd|00eeeeee|00ffffff|00gggggg|00hhhhhh| `--------+--------+--------+--------+--------+--------+--------+--------' .--------+--------+--------+--------+--------+--------+--------+--------. |AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|EEEEEEEE|FFFFFFFF|GGGGGGGG|HHHHHHHH| `--------+--------+--------+--------+--------+--------+--------+--------' The UCS-2 characters are divided into 6 bit chunks, which are then encoded into Base64 characters. */ static bool transform_utf16_utf7 (RECODE_SUBTASK subtask) { unsigned value; if (!get_ucs2 (&value, subtask)) SUBTASK_RETURN (subtask); while (true) if (IS_BODY_DIRECT (value)) { /* Copy one direct character. */ put_byte (value, subtask); if (!get_ucs2 (&value, subtask)) SUBTASK_RETURN (subtask); } else { /* Copy a string of non-direct characters. */ put_byte ('+', subtask); while (!IS_BODY_DIRECT (value)) { unsigned split; /* Process first UCS-2 value of a triplet. */ put_byte (base64_value_to_char[MASK (6) & value >> 10], subtask); put_byte (base64_value_to_char[MASK (6) & value >> 4], subtask); split = (value & MASK (4)) << 2; if (!get_ucs2 (&value, subtask)) { put_byte (base64_value_to_char[split], subtask); SUBTASK_RETURN (subtask); } /* Process second UCS-2 value of a triplet. */ if (IS_BODY_DIRECT (value)) { put_byte (base64_value_to_char[split], subtask); break; } put_byte (base64_value_to_char[split | (MASK (2) & value >> 14)], subtask); put_byte (base64_value_to_char[MASK (6) & value >> 8], subtask); put_byte (base64_value_to_char[MASK (6) & value >> 2], subtask); split = (value & MASK (2)) << 4; if (!get_ucs2 (&value, subtask)) { put_byte (base64_value_to_char[split], subtask); SUBTASK_RETURN (subtask); } /* Process third UCS-2 value of a triplet. */ if (IS_BODY_DIRECT (value)) { put_byte (base64_value_to_char[split], subtask); break; } put_byte (base64_value_to_char[split | (MASK (4) & value >> 12)], subtask); put_byte (base64_value_to_char[MASK (6) & value >> 6], subtask); put_byte (base64_value_to_char[MASK (6) & value], subtask); if (!get_ucs2 (&value, subtask)) SUBTASK_RETURN (subtask); } if (IS_BASE64 (value)) put_byte ('-', subtask); } SUBTASK_RETURN (subtask); } static bool transform_utf7_utf16 (RECODE_SUBTASK subtask) { int character; unsigned value; unsigned split; character = get_byte (subtask); if (character != EOF && subtask->task->byte_order_mark) put_ucs2 (BYTE_ORDER_MARK, subtask); while (character != EOF) if (character == '+') { character = get_byte (subtask); while (IS_BASE64 (character)) { /* Process first byte of first quadruplet. */ value = base64_char_to_value[character] << 10; character = get_byte (subtask); /* Process second byte of first quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); break; } value |= base64_char_to_value[character] << 4; character = get_byte (subtask); /* Process third byte of first quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); break; } split = base64_char_to_value[character]; value |= split >> 2; if (IS_BODY_DIRECT (value)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask); put_ucs2 (value, subtask); character = get_byte (subtask); /* Process fourth byte of first quadruplet. */ if (!IS_BASE64 (character)) { if (MASK (2) & split) RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); break; } value = ((MASK (2) & split) << 14 | base64_char_to_value[character] << 8); character = get_byte (subtask); /* Process first byte of second quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); break; } value |= base64_char_to_value[character] << 2; character = get_byte (subtask); /* Process second byte of second quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); break; } split = base64_char_to_value[character]; value |= split >> 4; if (IS_BODY_DIRECT (value)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask); put_ucs2 (value, subtask); character = get_byte (subtask); /* Process third byte of second quadruplet. */ if (!IS_BASE64 (character)) { if (MASK (4) & split) RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); break; } value = ((MASK (4) & split) << 12 | base64_char_to_value[character] << 6); character = get_byte (subtask); /* Process fourth byte of second quadruplet. */ if (!IS_BASE64 (character)) { RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); break; } value |= base64_char_to_value[character]; if (IS_BODY_DIRECT (value)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask); put_ucs2 (value, subtask); character = get_byte (subtask); } if (character == '-') { character = get_byte (subtask); if (!IS_BASE64 (character)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask); } } else { if (!IS_BODY_DIRECT (character)) RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask); put_byte (NUL, subtask); put_byte (character, subtask); character = get_byte (subtask); } SUBTASK_RETURN (subtask); } bool module_utf7 (RECODE_OUTER outer) { return declare_single (outer, "UTF-16", "UNICODE-1-1-UTF-7", /* RFC1642 */ outer->quality_variable_to_variable, NULL, transform_utf16_utf7) && declare_single (outer, "UNICODE-1-1-UTF-7", "UTF-16", outer->quality_variable_to_variable, NULL, transform_utf7_utf16) && declare_alias (outer, "UTF-7", "UNICODE-1-1-UTF-7") && declare_alias (outer, "TF-7", "UNICODE-1-1-UTF-7") && declare_alias (outer, "u7", "UNICODE-1-1-UTF-7") /* Simple UCS-2 does not have to go through UTF-16. */ && declare_single (outer, "ISO-10646-UCS-2", "UNICODE-1-1-UTF-7", outer->quality_variable_to_variable, NULL, transform_utf16_utf7); } void delmodule_utf7 (RECODE_OUTER outer) { }