/* Conversion of files between different charsets and surfaces.
   Copyright © 1999, 2000, 2001 Free Software Foundation, Inc.
   Contributed by François Pinard, 1999,
   and Bruno Haible, 2000.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public License
   as published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the `recode' Library; see the file `COPYING.LIB'.
   If not, write to the Free Software Foundation, Inc., 59 Temple Place -
   Suite 330, Boston, MA 02111-1307, USA.  */

#include "common.h"
#include "iconv.h"
#include "libiconv.h"

/*-----------------------------------------.
| Use `libiconv' to handle a double step.  |
`-----------------------------------------*/

#define LIBICONV_BUFFER_SIZE 2048

/* Run one subtask through libiconv.

   Bytes are pulled from SUBTASK with get_byte, accumulated in a local
   buffer of LIBICONV_BUFFER_SIZE bytes, converted with CONVERSION, and
   the result is pushed back to SUBTASK with put_byte.  Unconverted
   trailing input is shifted to the front of the buffer before the next
   round.

   CONVERSION_TO_UTF8 converts from the same source charset to UTF-8; it
   is only used after an EILSEQ failure, to decide whether to skip a
   single byte or a whole character of the offending input.

   The caller keeps ownership of both descriptors; this function never
   closes them.  The result is whatever SUBTASK_RETURN yields for
   SUBTASK.  */

static bool
wrapped_transform (iconv_t conversion, iconv_t conversion_to_utf8,
                   RECODE_SUBTASK subtask)
{
  int input_char = get_byte (subtask);
  char input_buffer[LIBICONV_BUFFER_SIZE];
  char output_buffer[LIBICONV_BUFFER_SIZE];
  size_t input_left;
  size_t output_left;
  const char *input;
  char *output;
  char *cursor;
  int transliterate;
  size_t converted;
  int saved_errno;

  /* libiconv has transliteration facilities.  Since they are not
     reversible, try first without them.  */
  transliterate = 0;
  iconvctl (conversion, ICONV_SET_TRANSLITERATE, &transliterate);
  iconvctl (conversion_to_utf8, ICONV_SET_TRANSLITERATE, &transliterate);

  /* CURSOR marks the end of the pending input inside INPUT_BUFFER.  Keep
     going while bytes are pending or more can still be read.  */
  cursor = input_buffer;
  while (cursor > input_buffer || input_char != EOF)
    {
      /* Fill the input buffer as much as possible.  */
      while (input_char != EOF
             && cursor < input_buffer + LIBICONV_BUFFER_SIZE)
        {
          *cursor++ = input_char;
          input_char = get_byte (subtask);
        }

      /* We are now sure to have at least some input.  */
      if (!(cursor > input_buffer))
        {
          recode_error (subtask->task->request->outer,
                        "libiconv.c internal error 56");
          SET_SUBTASK_ERROR (RECODE_INTERNAL_ERROR, subtask);
          SUBTASK_RETURN (subtask);
        }
      assert (cursor > input_buffer);

      /* Convert accumulated input into the output buffer.  */
      input_left = cursor - input_buffer;
      input = input_buffer;
      output_left = LIBICONV_BUFFER_SIZE;
      output = output_buffer;
      converted = iconv (conversion,
                         &input, &input_left, &output, &output_left);

      /* Send the converted result, to free the output buffer.  The errno
         left by iconv is saved around the writes, because it is examined
         just below.  */
      saved_errno = errno;
      for (cursor = output_buffer; cursor < output; cursor++)
        put_byte (*cursor, subtask);
      errno = saved_errno;

      /* Analyze the iconv return value.  E2BIG only means the output
         buffer filled up, which the flush above already dealt with.  */
      if (converted == (size_t)(-1) && errno != E2BIG)
        {
          if (errno == EILSEQ)
            {
              /* The first thing to try is libiconv's built-in
                 transliteration.  */
              if (!transliterate)
                {
                  /* Fail if the user requested reversible conversions.  */
                  RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask);

                  /* Switch transliteration on for the rest of the
                     subtask.  */
                  transliterate = 1;
                  iconvctl (conversion, ICONV_SET_TRANSLITERATE,
                            &transliterate);
                  iconvctl (conversion_to_utf8, ICONV_SET_TRANSLITERATE,
                            &transliterate);

                  /* Continue, shifting the remaining input buffer.  This
                     deliberately bypasses the no-progress check, since
                     the same input is about to be retried.  */
                  goto shift_input;
                }
              else
                {
                  /* An invalid multibyte sequence was encountered in the
                     input, or a conversion error occurred.  Distinguish
                     the two cases by use of conversion_to_utf8.  In the
                     first case, skip one byte.  In the second case, skip
                     the entire character.
                     FIXME: This heuristic does not work well with
                     stateful encodings like ISO-2022-JP.  */
                  char tmp_buf[6];
                  size_t tmp_input_left;
                  size_t tmp_output_left;
                  const char *tmp_input;
                  char *tmp_output;

                  RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
                  assert (input_left > 0);

                  /* Probe on copies of the input position, so that INPUT
                     and INPUT_LEFT only move once the case is known.  */
                  tmp_input_left = input_left;
                  tmp_input = input;
                  tmp_output_left = sizeof (tmp_buf);
                  tmp_output = tmp_buf;
                  iconv (conversion_to_utf8,
                         &tmp_input, &tmp_input_left,
                         &tmp_output, &tmp_output_left);

                  /* Reset conversion_to_utf8 to the initial state.  */
                  iconv (conversion_to_utf8, NULL, NULL, NULL, NULL);

                  if (tmp_input > input)
                    {
                      /* Conversion error.  Skip the entire character.  */
                      input = tmp_input;
                      input_left = tmp_input_left;
                    }
                  else
                    {
                      /* Invalid input.  Skip one byte.  */
                      input++;
                      input_left--;
                    }

                  /* Reset conversion state.  (Why?)  */
                  output_left = LIBICONV_BUFFER_SIZE;
                  output = output_buffer;
                  converted = iconv (conversion,
                                     NULL, NULL, &output, &output_left);

                  /* We don't expect E2BIG here: the buffer is large
                     enough.  */
                  assert (converted != (size_t)(-1));
                  for (cursor = output_buffer; cursor < output; cursor++)
                    put_byte (*cursor, subtask);
                }
            }
          else if (errno == EINVAL)
            {
              /* Incomplete multibyte sequence.  */
              if (input + input_left < input_buffer + LIBICONV_BUFFER_SIZE
                  && input_char == EOF)
                {
                  /* Incomplete multibyte sequence at end of input.  */
                  RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
                  break;
                }

              /* Otherwise, we shift the remaining input and see whether
                 the error persists in the next round.  */
            }
          else
            {
              recode_perror (subtask->task->request->outer, "libiconv ()");
              SET_SUBTASK_ERROR (RECODE_SYSTEM_ERROR, subtask);
              SUBTASK_RETURN (subtask);
            }
        }

      /* If there was no progress, we have a bug in either libiconv or the
         above logic.  */
      if (!(input > input_buffer))
        {
          recode_error (subtask->task->request->outer,
                        "libiconv.c internal error 154");
          SET_SUBTASK_ERROR (RECODE_INTERNAL_ERROR, subtask);
          SUBTASK_RETURN (subtask);
        }
      assert (input > input_buffer);

    shift_input:

      /* Shift back the unconverted part of the input buffer.
         memcpy() doesn't do here, because the regions might overlap.
         memmove() isn't worth it, because we rarely have to move more
         than 12 bytes.  */
      cursor = input_buffer;
      if (input_left > 0)
        {
          if (input > input_buffer)
            {
              do
                *cursor++ = *input++;
              while (--input_left > 0);
            }
          else
            /* Nothing was consumed: the pending bytes are already at the
               front of the buffer, only the end marker needs setting.  */
            cursor += input_left;
        }
    }

  /* Drain all accumulated partial state and emit output to return to
     the initial shift state.  */
  output_left = LIBICONV_BUFFER_SIZE;
  output = output_buffer;
  converted = iconv (conversion, NULL, NULL, &output, &output_left);

  /* We don't expect E2BIG here: the buffer is large enough.  */
  assert (converted != (size_t)(-1));
  for (cursor = output_buffer; cursor < output; cursor++)
    put_byte (*cursor, subtask);

  SUBTASK_RETURN (subtask);
}

/* Transform the input of SUBTASK using libiconv, converting from the
   charset named by the step's `before' symbol to the one named by its
   `after' symbol.

   Two descriptors are opened: the real conversion, and a helper from the
   same source charset to UTF-8 that wrapped_transform uses while
   recovering from errors.  If either cannot be opened,
   RECODE_SYSTEM_ERROR is set on SUBTASK and no conversion is attempted.
   Both descriptors are closed before returning, whatever the outcome.  */

bool
transform_with_libiconv (RECODE_SUBTASK subtask)
{
  RECODE_CONST_STEP step = subtask->step;
  iconv_t conversion = iconv_open (step->after->name, step->before->name);
  iconv_t conversion_to_utf8 = iconv_open ("UTF-8", step->before->name);
  bool status;

  if (conversion == (iconv_t) -1 || conversion_to_utf8 == (iconv_t) -1)
    {
      /* Only one of the two opens may have failed.  Release the one that
         succeeded, so that it is not leaked, and keep errno as it was so
         that the error reporting below sees the same value as before.  */
      int saved_errno = errno;

      if (conversion != (iconv_t) -1)
        iconv_close (conversion);
      if (conversion_to_utf8 != (iconv_t) -1)
        iconv_close (conversion_to_utf8);
      errno = saved_errno;

      SET_SUBTASK_ERROR (RECODE_SYSTEM_ERROR, subtask);
      SUBTASK_RETURN (subtask);
    }

  status = wrapped_transform (conversion, conversion_to_utf8, subtask);

  iconv_close (conversion);
  iconv_close (conversion_to_utf8);
  return status;
}

/*---------------------------------------------------------.
| Declare all character sets which `libiconv' may handle.  |
`---------------------------------------------------------*/

/* Register with OUTER every charset listed in iconv_name_list, together
   with its aliases.

   The traversal treats iconv_name_list as a sequence of groups of names,
   each group ended by a null pointer, with the whole list ended by an
   empty group.  Returns false as soon as declare_libiconv or
   declare_alias fails, true once the whole list has been processed.  */

bool
module_libiconv (RECODE_OUTER outer)
{
  const char **cursor;

  /* Each iteration handles one group.  The inner loops leave CURSOR on
     the null pointer ending the group, so the increment below steps onto
     the first name of the next group.  */
  for (cursor = iconv_name_list; *cursor; cursor++)
    {
      const char **aliases = cursor;
      const char *charset_name = *cursor;

      /* Scan aliases for some charset which would already be known.  If
         any, use its official name as a charset.  Else, use the first
         alias.  */
      while (*cursor)
        {
          RECODE_ALIAS alias
            = find_alias (outer, *cursor, ALIAS_FIND_AS_CHARSET);

          if (alias)
            {
              charset_name = alias->symbol->name;
              break;
            }
          cursor++;
        }

      if (!declare_libiconv (outer, charset_name))
        return false;

      /* Declare all aliases, given they bring something we do not already
         know.  Even then, we still declare too many useless aliases, as
         the disambiguating tables are not recomputed as we go.  FIXME!  */
      for (cursor = aliases; *cursor; cursor++)
        {
          RECODE_ALIAS alias
            = find_alias (outer, *cursor, ALIAS_FIND_AS_CHARSET);

          /* If there is a charset contradiction, call declare_alias
             nevertheless, as the error processing will occur there.
             The names are compared by pointer, not by content.  */
          if (!alias || alias->symbol->name != charset_name)
            if (!declare_alias (outer, *cursor, charset_name))
              return false;
        }
    }

  return true;
}

/* Counterpart of module_libiconv.  The body is empty: nothing is done
   with OUTER here.  */

void
delmodule_libiconv (RECODE_OUTER outer)
{
}