/* GNU Ocrad - Optical Character Recognition program
Copyright (C) 2003-2019 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include
#include
#include "common.h"
#include "rectangle.h"
#include "track.h"
#include "ucs.h"
#include "bitmap.h"
#include "blob.h"
#include "character.h"
#include "textline.h"
// All the code in this file is provisional and will be rewritten someday
namespace {

// Scan 'cpv' forward from index 'i' and return the index of the first
// character that may be a space or a hyphen.
// Returns cpv.size() if no such character is found.
int find_space_or_hyphen( const std::vector< Character * > & cpv, unsigned i )
{
  for( ; i < cpv.size(); ++i )
  {
    const Character & ch = *cpv[i];
    if( ch.maybe(' ') || ch.maybe('-') ) break;
  }
  return i;
}

} // end namespace
// transform some small letters to capitals
void Textline::check_lower_ambiguous()
{
int begin = big_initials();
bool isolated = false; // isolated letters compare with all line
for( int i = big_initials(); i < characters(); ++i )
{
Character & c1 = character( i );
if( c1.maybe(' ') )
{
if( i + 2 < characters() && character( i + 2 ).maybe(' ') )
{ begin = big_initials(); isolated = true; }
else { begin = i + 1; isolated = false; }
continue;
}
if( c1.guesses() == 1 )
{
const int code = c1.guess( 0 ).code;
if( !UCS::islower_small_ambiguous( code ) ) continue;
if( 5 * c1.height() < 4 * mean_height() ) continue;
bool capital = ( 4 * c1.height() > 5 * mean_height() );
bool small = false;
for( int j = begin; j < characters(); ++j ) if( j != i )
{
const Character & c2 = character( j );
if( !c2.guesses() ) continue;
if( c2.maybe(' ') ) { if( isolated ) continue; else break; }
const int code2 = c2.guess( 0 ).code;
if( code2 >= 128 || !std::isalpha( code2 ) ) continue;
if( !capital )
{
if( 4 * c1.height() > 5 * c2.height() ) capital = true;
else if( std::isupper( code2 ) && code2 != 'B' && code2 != 'Q' &&
( c1.height() >= c2.height() ||
Ocrad::similar( c1.height(), c2.height(), 10 ) ) )
capital = true;
else if( code2 == 't' && c1.height() >= c2.height() ) capital = true;
}
if( !small && std::islower( code2 ) && code2 != 'l' && code2 != 'j' )
{
if( 5 * c1.height() < 4 * c2.height() ) small = true;
else if( UCS::islower_small( code2 ) &&
code2 != 'r' && !c2.maybe('Q') &&
( j < i || !UCS::islower_small_ambiguous( code2 ) ) &&
Ocrad::similar( c1.height(), c2.height(), 10 ) )
small = true;
}
}
if( capital && !small ) c1.insert_guess( 0, std::toupper( code ), 1 );
}
}
}
// Second recognition pass over the characters of this line, starting at
// big_initials().
// Runs a fixed sequence of steps, each introduced by its own comment below.
// The first steps split or clean up characters that still have no guesses
// and retry recognize1() on the pieces. The remaining steps use neighbouring
// characters (mostly relative heights and positions) to reorder, insert or
// replace guesses, and to join or split adjacent characters.
// 'charset' is passed to recognize1() and selects the ISO-8859-9 steps.
// Does nothing if there are no characters after the big initials.
void Textline::recognize2( const Charset & charset )
{
  if( big_initials() >= characters() ) return;

  // try to recognize separately the 3 overlapped blobs of an
  // unrecognized character
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( !c.guesses() && c.blobs() == 3 )
    {
      const Blob & b1 = c.blob( 0 );
      const Blob & b2 = c.blob( 1 );
      const Blob & b3 = c.blob( 2 );            // lower blob
      if( Ocrad::similar( b2.height(), b3.height(), 20 ) && !b2.h_overlaps( b3 ) &&
          b2.v_includes( b3.vcenter() ) && b3.v_includes( b2.vcenter() ) &&
          b1.bottom() < b2.top() && b1.bottom() < b3.top() )
      {
        if( b1.height() > b2.height() && b1.height() > b3.height() )
        {
          // the upper blob is the tallest; try it alone
          Character c1( new Blob( b1 ) );
          c1.recognize1( charset, charbox( c1 ) );
          if( c1.guesses() ) c = c1;
        }
        else
        {
          // try the two side by side blobs as two characters, giving the
          // upper blob to the one that horizontally includes its center
          Character c2( new Blob( b2 ) );
          Character c3( new Blob( b3 ) );
          if( b2.h_includes( b1.hcenter() ) ) c2.shift_blobp( new Blob( b1 ) );
          else if( b3.h_includes( b1.hcenter() ) ) c3.shift_blobp( new Blob( b1 ) );
          c2.recognize1( charset, charbox( c2 ) );
          c3.recognize1( charset, charbox( c3 ) );
          if( c2.guesses() && c3.guesses() )
            { c = c2; shift_characterp( new Character( c3 ) ); ++i; }
        }
      }
    }
  }

  // try to recognize separately the 2 overlapped blobs of an
  // unrecognized character
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( !c.guesses() && c.blobs() == 2 &&
        c.blob( 0 ).v_overlaps( c.blob( 1 ) ) )
    {
      Character c1( new Blob( c.blob( 0 ) ) );
      c1.recognize1( charset, charbox( c1 ) );
      Character c2( new Blob( c.blob( 1 ) ) );
      c2.recognize1( charset, charbox( c2 ) );
      if( ( c1.guesses() && c2.guesses() ) ||
          Ocrad::similar( c1.height(), c2.height(), 20 ) )
      {
        // keep the taller piece in place; c2 becomes the other piece
        if( c1.height() > c2.height() ) c = c1;
        else { c = c2; c2 = c1; }
        // discards spurious dots
        if( !c2.maybe('.') || c2.top() > c.vcenter() )
          { shift_characterp( new Character( c2 ) ); ++i; }
      }
    }
  }

  // remove speckles under the charbox' bottom of an unrecognized character
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( !c.guesses() && c.blobs() == 2 &&
        c.blob( 0 ).size() > 10 * c.blob( 1 ).size() &&
        c.blob( 1 ).top() > charbox( c ).bottom() )
    {
      Character c1( new Blob( c.blob( 0 ) ) );
      c1.recognize1( charset, charbox( c1 ) );
      if( c1.guesses() ) c = c1;
    }
  }

  // remove speckles above the charbox' top of an unrecognized character
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( !c.guesses() && c.blobs() == 2 &&
        c.blob( 1 ).size() > 5 * c.blob( 0 ).size() &&
        c.blob( 0 ).bottom() + 2 * c.blob( 0 ).height() < charbox( c ).top() )
    {
      Character c1( new Blob( c.blob( 1 ) ) );
      c1.recognize1( charset, charbox( c1 ) );
      if( c1.guesses() ) c = c1;
    }
  }

  // try to separate lightly merged characters
  // FIXME try relative minima (small pixel count surrounded by larger counts)
  // FIXME try all possible separation points
  // FIXME try other separation paths (an irregular line, not a column)
  // FIXME sometimes leaves unconnected blobs
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( !c.guesses() && c.width() > 20 && 5 * c.width() >= 3 * c.height() &&
        5 * c.height() >= 3 * mean_height() )
    {
      int ib = c.blobs() - 1;
      for( int k = ib - 1; k >= 0; --k )
        if( c.blob( k ).width() > c.blob( ib ).width() ) ib = k;
      if( ib < 0 || 10 * c.blob( ib ).width() < 9 * c.width() ||
          c.blob( ib ).bottom() < c.bottom() )
        continue;
      const Blob & b = c.blob( ib );            // widest blob

      // find the column with fewest set pixels between 30% and 70% of the
      // blob width; on ties prefer the last such column up to the center
      int colmin = 0, cmin = b.height() + 1;
      for( int col = b.hpos( 30 ); col <= b.hpos( 70 ); ++col )
      {
        int pixels = 0;                         // set pixels in this column
        for( int row = b.top(); row <= b.bottom(); ++row )
          if( b.id( row, col ) ) ++pixels;
        if( pixels < cmin || ( pixels == cmin && col <= b.hcenter() ) )
          { cmin = pixels; colmin = col; }
      }
      if( 4 * cmin > b.height() ||
          ( 5 * cmin > b.height() &&
            ( colmin <= b.hpos( 40 ) || colmin >= b.hpos( 60 ) ) ) ) continue;
      if( colmin <= b.left() || colmin >= b.right() ) continue;

      // cut the blob at column colmin (the column itself is dropped)
      Rectangle r1( b.left(), b.top(), colmin - 1, b.bottom() );
      Rectangle r2( colmin + 1, b.top(), b.right(), b.bottom() );
      Blob b1( b, r1 );
      b1.adjust_height(); if( 2 * b1.height() < b.height() ) continue;
      Blob b2( b, r2 );
      b2.adjust_height(); if( 2 * b2.height() < b.height() ) continue;
      b1.find_holes(); b2.find_holes();
      Character c1( new Blob( b1 ) );
      Character c2( new Blob( b2 ) );
      // distribute the remaining blobs between the two halves
      for( int j = 0; j < c.blobs(); ++j ) if( j != ib )
      {
        const Blob & bj = c.blob( j );
        if( c1.includes_hcenter( bj ) ) c1.shift_blobp( new Blob( bj ) );
        else if( c2.includes_hcenter( bj ) ) c2.shift_blobp( new Blob( bj ) );
      }
      c1.recognize1( charset, charbox( c1 ) );
      c2.recognize1( charset, charbox( c2 ) );
      const bool good_c2 = ( c2.guesses() && c2.guess( 0 ).code != '\'' );
      if( ( c1.guesses() && good_c2 ) ||
          ( ( c1.guesses() || good_c2 ) && c.width() > c.height() ) )
      {
        c = c1; shift_characterp( new Character( c2 ) );
        // --i makes the loop visit index i again (unrecognized left half);
        // ++i skips the right half when both halves were recognized
        if( !c1.guesses() ) --i; else if( c2.guesses() ) ++i;
      }
    }
  }

  // try to recognize 1 blob unrecognized characters with holes by
  // removing small holes (noise)
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( !c.guesses() && c.blobs() == 1 && c.blob( 0 ).holes() )
    {
      Character c1( c );
      Blob & b = c1.blob( 0 );
      for( int j = b.holes() - 1; j >= 0; --j )
        if( 64 * b.hole( j ).size() <= b.size() ||
            16 * b.hole( j ).height() <= b.height() ) b.fill_hole( j );
      if( b.holes() < c.blob( 0 ).holes() )
      {
        c1.recognize1( charset, charbox( c1 ) );
        if( c1.guesses() ) { c = c1; continue; }
      }
/*    if( b.holes() == 1 && 25 * b.hole( 0 ).size() < b.size() &&
          Ocrad::similar( b.height(), b.width(), 40 ) )
      {
        b.fill_hole( 0 );
        c1.recognize1( charset, charbox( c1 ) );
        if( c1.guesses() ) { c = c1; continue; }
      }*/
    }
  }

  // separate merged characters recognized by recognize1
  // A negative first guess code marks such a character: -(code + 1) is the
  // index of the blob to cut, guess( 0 ).value is used as the left column
  // and each following guess supplies the right column and the code of one
  // piece.
  for( int i = big_initials(); i < characters(); )
  {
    if( !cpv[i]->guesses() || cpv[i]->guess( 0 ).code >= 0 ) { ++i; continue; }
    const Character c( *cpv[i] );
    const int blob_index = -(c.guess( 0 ).code + 1);
    delete_character( i );
    if( c.guesses() >= 3 && c.blobs() >= 1 && blob_index < c.blobs() )
    {
      int left = c.guess( 0 ).value;
      for( int g = 1; g < c.guesses(); ++g )
      {
        Blob b( c.blob( blob_index ) );
        Rectangle re( left, b.top(), c.guess( g ).value, b.bottom() );
        b.add_rectangle( re );
        Blob b1( b, re );
        b1.adjust_height(); b1.adjust_width(); b1.find_holes();
        Character c1( new Blob( b1 ) );
        for( int k = 0; k < c.blobs(); ++k )
          if( k != blob_index && !c.blob( k ).includes( re ) &&
              re.includes_hcenter( c.blob( k ) ) )
            c1.shift_blobp( new Blob( c.blob( k ) ) );
        c1.add_guess( c.guess( g ).code, 0 );
        shift_characterp( new Character( c1 ) );
        left = re.right() + 1;
      }
    }
  }

  // choose between 'B' and 'a'
  for( int i = big_initials(), begin = i; i < characters(); ++i )
  {
    Character & c1 = character( i );
    if( c1.maybe(' ') ) { begin = i + 1 ; continue; }
    if( c1.guesses() )
    {
      int code = c1.guess( 0 ).code;
      if( c1.guesses() != 2 || code != 'B' || c1.guess( 1 ).code != 'a' )
        continue;
      if( 4 * c1.height() > 5 * mean_height() ) continue;
      for( int j = begin; j < characters(); ++j ) if( j != i )
      {
        Character & c2 = character( j );
        if( c2.maybe(' ') ) break;
        if( c2.guesses() >= 1 )
        {
          int code2 = c2.guess( 0 ).code;
          if( code2 >= 128 ) continue;
          if( ( std::isupper( code2 ) && code2 != 'B' && code2 != 'Q' &&
                5 * c1.height() < 4 * c2.height() ) ||
              ( UCS::islower_small( code2 ) && code2 != 'r' &&
                !UCS::islower_small_ambiguous( code2 ) &&
                ( c1.height() <= c2.height() ||
                  Ocrad::similar( c1.height(), c2.height(), 10 ) ) ) )
            { c1.swap_guesses( 0, 1 ); break; }
        }
      }
    }
  }

  // choose between '8' and 'a' or 'e'
  for( int i = big_initials(), begin = i; i < characters(); ++i )
  {
    Character & c1 = character( i );
    if( c1.maybe(' ') ) { begin = i + 1 ; continue; }
    if( c1.guesses() == 2 && c1.guess( 1 ).code == '8' )
    {
      int code = c1.guess( 0 ).code;
      if( ( code != 'a' && code != 'e' ) || 5 * c1.height() < 4 * mean_height() )
        continue;
      for( int j = begin; j < characters(); ++j ) if( j != i )
      {
        Character & c2 = character( j );
        if( c2.maybe(' ') ) break;
        if( c2.guesses() >= 1 )
        {
          int code2 = c2.guess( 0 ).code;
          if( code2 >= 128 ) continue;
          if( ( ( std::isalpha( code2 ) || code2 == ':' ) &&
                4 * c1.height() > 5 * c2.height() ) ||
              ( ( std::isdigit( code2 ) || std::isupper( code2 ) || code2 == 'l' ) &&
                ( c1.height() >= c2.height() ||
                  Ocrad::similar( c1.height(), c2.height(), 10 ) ) ) )
            { c1.swap_guesses( 0, 1 ); break; }
        }
      }
    }
  }

  check_lower_ambiguous();

  // transform 'i' into 'j'
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c1 = character( i );
    if( c1.guesses() == 1 && c1.guess( 0 ).code == 'i' )
    {
      // compare with the next character, or else with the previous one
      int j = i + 1;
      if( j >= characters() || !character( j ).guesses() )
        { j = i - 1; if( j < big_initials() || !character( j ).guesses() ) continue; }
      Character & c2 = character( j );
      if( UCS::isvowel( c2.guess( 0 ).code ) &&
          c1.bottom() >= c2.bottom() + ( c2.height() / 4 ) )
        c1.insert_guess( 0, 'j', 1 );
    }
  }

  // transform small o or u with accent or diaeresis to capital
  // transform small s or z with caron to capital
  {
    int begin = big_initials();
    bool isolated = false;              // isolated letters compare with all line
    for( int i = big_initials(); i < characters(); ++i )
    {
      Character & c1 = character( i );
      if( c1.guesses() >= 1 )
      {
        if( c1.maybe(' ') )
        {
          if( i + 2 < characters() && character( i + 2 ).maybe(' ') )
            { begin = big_initials(); isolated = true; }
          else { begin = i + 1; isolated = false; }
          continue;
        }
        int code = c1.guess( 0 ).code;
        if( code < 128 || c1.blobs() < 2 ) continue;
        int codeb = UCS::base_letter( code );
        if( codeb != 'o' && codeb != 'u' && codeb != 's' && codeb != 'z' )
          continue;
        const Blob & b1 = c1.blob( c1.blobs() - 1 );    // lower blob
        for( int j = begin; j < characters(); ++j ) if( j != i )
        {
          Character & c2 = character( j );
          if( c2.guesses() >= 1 )
          {
            if( c2.maybe(' ') ) { if( isolated ) continue; else break; }
            const int code2 = c2.guess( 0 ).code;
            const int code2b = UCS::base_letter( code2 );
            if( !code2b && code2 >= 128 ) continue;
            // code2 may still be >= 128 here (when it has a base letter).
            // The <cctype> functions are undefined for values that are not
            // representable as unsigned char, so only classify ASCII codes.
            const bool ascii2 = ( code2 >= 0 && code2 < 128 );
            if( ( ascii2 && std::isalpha( code2 ) &&
                  4 * b1.height() > 5 * c2.height() ) ||
                ( ascii2 && std::isupper( code2 ) &&
                  Ocrad::similar( b1.height(), c2.height(), 10 ) ) ||
                ( std::isalpha( code2b ) && 4 * c1.height() > 5 * c2.height() ) ||
                ( std::isupper( code2b ) &&
                  Ocrad::similar( c1.height(), c2.height(), 10 ) ) )
              { c1.insert_guess( 0, UCS::toupper( code ), 1 ); break; }
          }
        }
      }
    }
  }

  // transform 'O' or 'l' into '0' or '1'
  for( int i = big_initials(), begin = i; i < characters(); ++i )
  {
    Character & c1 = character( i );
    if( c1.maybe(' ') ) { begin = i + 1 ; continue; }
    if( c1.guesses() >= 1 )
    {
      int code = c1.guess( 0 ).code;
      if( code != 'o' && code != 'O' && code != 'l' ) continue;
      for( int j = begin; j < characters(); ++j ) if( j != i )
      {
        Character & c2 = character( j );
        if( c2.maybe(' ') ) break;
        if( c2.guesses() >= 1 )
        {
          int code2 = c2.guess( 0 ).code;
          if( UCS::isdigit( code2 ) )
          {
            if( Ocrad::similar( c1.height(), c2.height(), 10 ) )
              c1.insert_guess( 0, (code == 'l') ? '1' : '0', c1.guess( 0 ).value + 1 );
            break;
          }
          // any other letter in the word stops the search
          if( UCS::isalpha( code2 ) &&
              code2 != 'o' && code2 != 'O' && code2 != 'l' ) break;
        }
      }
    }
  }

  // transform a small 'p' to a capital 'P'
  for( int i = characters() - 1; i >= big_initials(); --i )
  {
    Character & c1 = character( i );
    if( c1.guesses() == 1 && c1.guess( 0 ).code == 'p' )
    {
      const int noise = std::max( 2, c1.height() / 20 );
      bool cap = false, valid_c2 = false;
      if( i < characters() - 1 && character(i+1).guesses() )
      {
        Character & c2 = character( i + 1 );
        int code = c2.guess( 0 ).code;
        if( UCS::isalnum( code ) || code == '.' || code == '|' )
        {
          valid_c2 = true;
          switch( code )
          {
            case 'g': case 'j': case 'p': case 'q': case 'y':
              cap = ( c1.bottom() + noise <= c2.bottom() ); break;
            case 'Q': cap = ( std::abs( c1.top() - c2.top() ) <= noise ); break;
            default : cap = ( std::abs( c1.bottom() - c2.bottom() ) <= noise );
          }
        }
      }
      // no usable next character: compare with the bottom of the charbox
      if( !valid_c2 && i > big_initials() && !character(i-1).maybe(' ') )
        cap = ( std::abs( c1.bottom() - charbox(c1).bottom() ) <= noise );
      if( cap ) c1.only_guess( 'P', 0 );
    }
  }

  // transform a capital 'Y' to a small 'y'
  for( int i = characters() - 1; i > big_initials(); --i )
  {
    Character & c1 = character( i - 1 );
    if( c1.guesses() == 1 && c1.guess( 0 ).code == 'Y' )
    {
      Character & c2 = character( i );
      if( !c2.guesses() ) continue;
      int code = c2.guess( 0 ).code;
      if( UCS::isalnum( code ) || code == '.' || code == '|' )
      {
        // 'continue' inside the switch goes to the next iteration of the
        // for loop, leaving the 'Y' unchanged
        switch( code )
        {
          case 'g': case 'j': case 'p': case 'q': case 'y':
            if( c1.bottom() < c2.bottom() - 2 ) continue;
            break;
          case 'Q':
            if( c1.top() < c2.top() + 2 ) continue;
            break;
          default :
            if( c1.bottom() < c2.bottom() + 2 ) continue;
        }
        c1.only_guess( 'y', 0 );
      }
    }
  }

  // transform a SSCEDI to a CSCEDI
  if( charset.enabled( Charset::iso_8859_9 ) )
    for( int i = big_initials(); i < characters(); ++i )
    {
      Character & c = character( i );
      if( c.guesses() == 1 && c.guess( 0 ).code == UCS::SSCEDI )
      {
        // try the previous character first, then the next one
        if( i > big_initials() && character( i - 1 ).guesses() )
        {
          Character & c1 = character( i - 1 );
          int code = c1.guess( 0 ).code;
          if( ( UCS::islower( code ) && c.top() < c1.top() - 2 ) ||
              ( UCS::base_letter( code ) && code != UCS::SINODOT &&
                Ocrad::similar( c.top(), c1.top(), 10 ) ) )
            { c.insert_guess( 0, UCS::CSCEDI, 1 ); continue; }
        }
        if( i < characters() - 1 && character( i + 1 ).guesses() )
        {
          Character & c1 = character( i + 1 );
          int code = c1.guess( 0 ).code;
          if( ( UCS::islower( code ) && c.top() < c1.top() - 2 ) ||
              ( UCS::base_letter( code ) && code != UCS::SINODOT &&
                Ocrad::similar( c.top(), c1.top(), 10 ) ) )
            { c.insert_guess( 0, UCS::CSCEDI, 1 ); continue; }
        }
      }
    }

  // transform words like 'lO.OOO' into numbers like '10.000'
  for( int begin = big_initials(), end = begin; begin < characters(); begin = end + 1 )
  {
    end = find_space_or_hyphen( cpv, begin );
    if( end - begin < 2 ) continue;
    Character & c1 = character( begin );
    if( !c1.guesses() ) continue;
    const int height = c1.height();
    const int code1 = c1.guess( 0 ).code;
    if( UCS::isdigit( code1 ) || code1 == 'l' || code1 == 'O' || code1 == 'o' )
    {
      // every character of the word must be digit-like (with a height
      // similar to the first one) or one of the accepted separators
      int digits = 1;
      int i = begin + 1;
      for( ; i < end; ++i )
      {
        Character & c = character( i );
        if( !c.guesses() ) break;
        bool valid = false;
        int code = c.guess( 0 ).code;
        if( ( UCS::isdigit( code ) || code == 'l' || code == 'O' || code == 'o' ) &&
            Ocrad::similar( c.height(), height, 10 ) )
          { valid = true; ++digits; }
        if( code == '.' || code == ',' || code == ':' || code == '+' || code == '-' )
          valid = true;
        if( !valid ) break;
      }
      if( i >= end && digits >= 2 )
        for( i = begin; i < end; ++i )
        {
          Character & c = character( i );
          int code = c.guess( 0 ).code;
          if( code == 'l' ) code = '1';
          else if( code == 'O' || code == 'o' ) code = '0';
          else code = 0;
          if( code ) c.insert_guess( 0, code, c.guess( 0 ).value + 1 );
        }
    }
  }

  // detects Roman numerals 'II', 'III' and 'IIII'
  for( int begin = big_initials(), end = begin; begin < characters(); begin = end + 1 )
  {
    end = find_space_or_hyphen( cpv, begin );
    if( end - begin < 2 || end - begin > 4 ) continue;
    const int height = character( begin ).height();
    int i;
    for( i = begin; i < end; ++i )
    {
      Character & c = character( i );
      if( !c.maybe('|') || !Ocrad::similar( c.height(), height, 10 ) ) break;
    }
    if( i >= end )
      for( i = begin; i < end; ++i )
      {
        Character & c = character( i );
        if( !c.maybe('I') )
          c.insert_guess( 0, 'I', c.guess( 0 ).value + 1 );
      }
  }

  // choose between 'a' and 'Q'
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( c.guesses() == 2 &&
        c.guess( 0 ).code == 'a' && c.guess( 1 ).code == 'Q' )
    {
      if( 4 * c.height() > 5 * mean_height() )
        { c.swap_guesses( 0, 1 ); check_lower_ambiguous(); continue; }
      if( i < characters() - 1 && character( i + 1 ).guesses() )
      {
        const int code = character( i + 1 ).guess( 0 ).code;
        if( ( UCS::ishigh( code ) ) &&
            10 * c.height() > 9 * character( i + 1 ).height() )
          { c.swap_guesses( 0, 1 ); check_lower_ambiguous(); continue; }
      }
      if( i > big_initials() && character( i - 1 ).guesses() )
      {
        const int code = character( i - 1 ).guess( 0 ).code;
        if( ( UCS::ishigh( code ) ) &&
            10 * c.height() > 9 * character( i - 1 ).height() )
          { c.swap_guesses( 0, 1 ); check_lower_ambiguous(); }
      }
    }
  }

  // transform a vertical bar into 'l' or 'I' (or a 'l' into an 'I')
  for( int i = big_initials(); i < characters(); ++i )
  {
    Character & c = character( i );
    if( c.guesses() != 1 ) continue;
    int code = c.guess( 0 ).code;
    if( code == '|' || code == 'l' )
    {
      // first guess of the left and right neighbours, 0 if none
      int lcode = 0, rcode = 0;
      if( i > 0 && character( i - 1 ).guesses() )
        lcode = character( i - 1 ).guess( 0 ).code;
      if( i < characters() - 1 && character( i + 1 ).guesses() )
        rcode = character( i + 1 ).guess( 0 ).code;
      if( ( UCS::isupper( rcode ) || UCS::isdigit( rcode ) ) &&
          ( !lcode || UCS::isupper( lcode ) || !UCS::isalnum( lcode ) ) )
        { c.insert_guess( 0, 'I', 1 ); continue; }
      if( code == 'l' ) continue;
      if( UCS::isalpha( lcode ) || UCS::isalpha( rcode ) )
        { c.insert_guess( 0, 'l', 1 ); continue; }
      if( rcode == '|' && ( !lcode || !UCS::isalnum( lcode ) ) )
      {
        if( i < characters() - 2 && character( i + 2 ).guesses() &&
            UCS::isalpha( character( i + 2 ).guess( 0 ).code ) )
          { c.insert_guess( 0, 'l', 1 ); continue; }
        if( i >= 2 && character( i - 2 ).guesses() &&
            UCS::isalpha( character( i - 2 ).guess( 0 ).code ) )
          { c.insert_guess( 0, 'l', 1 ); continue; }
      }
    }
  }

  // transform a vertical bar into 'I' at end of word
  for( int begin = big_initials(), end = begin; begin < characters(); begin = end + 1 )
  {
    end = find_space_or_hyphen( cpv, begin );
    if( end - begin < 3 ) continue;
    Character & ce = character( end - 1 );
    if( !ce.maybe('|') || ce.maybe('I') ) continue;
    const int height = ce.height();
    // the rest of the word must be capitals or digits of similar height
    int i;
    for( i = begin; i < end - 1; ++i )
    {
      const Character & c = character( i );
      if( !c.guesses() ) break;
      const int code = c.guess( 0 ).code;
      if( ( !UCS::isupper( code ) && !UCS::isdigit( code ) ) ||
          !Ocrad::similar( c.height(), height, 10 ) ) break;
    }
    if( i >= end - 1 )
      ce.insert_guess( 0, 'I', ce.guess( 0 ).value + 1 );
  }

  // transform 'l' or '|' into UCS::SINODOT
  if( charset.enabled( Charset::iso_8859_9 ) )
    for( int i = big_initials(), begin = i; i < characters(); ++i )
    {
      Character & c1 = character( i );
      if( c1.maybe(' ') ) { begin = i + 1 ; continue; }
      if( c1.guesses() )
      {
        int code = c1.guess( 0 ).code;
        if( code != 'l' && code != '|' ) continue;
        if( 4 * c1.height() > 5 * mean_height() ) continue;
        if( 5 * c1.height() < 4 * mean_height() )
          { c1.only_guess( UCS::SINODOT, 0 ); continue; }
        bool capital = false, small = false;
        for( int j = begin; j < characters(); ++j ) if( j != i )
        {
          Character & c2 = character( j );
          if( c2.maybe(' ') ) break;
          if( !c2.guesses() ) continue;
          int code2 = c2.guess( 0 ).code;
          if( code2 >= 128 || !std::isalpha( code2 ) ) continue;
          if( !capital )
          {
            if( 4 * c1.height() > 5 * c2.height() ) capital = true;
            else if( std::isupper( code2 ) && code2 != 'B' && code2 != 'Q' &&
                     ( c1.height() >= c2.height() ||
                       Ocrad::similar( c1.height(), c2.height(), 10 ) ) )
              capital = true;
          }
          if( !small && std::islower( code2 ) && code2 != 'l' )
          {
            if( 5 * c1.height() < 4 * c2.height() ) small = true;
            else if( UCS::islower_small( code2 ) &&
                     ( j < i || !UCS::islower_small_ambiguous( code2 ) ) &&
                     Ocrad::similar( c1.height(), c2.height(), 10 ) )
              small = true;
          }
        }
        if( !capital && small ) c1.insert_guess( 0, UCS::SINODOT, 1 );
      }
    }

  // join two adjacent single quotes into a double quote
  for( int i = big_initials(); i < characters() - 1; ++i )
  {
    Character & c1 = character( i );
    Character & c2 = character( i + 1 );
    if( c1.guesses() == 1 && c2.guesses() == 1 )
    {
      int code1 = c1.guess( 0 ).code;
      int code2 = c2.guess( 0 ).code;
      if( ( code1 == '\'' || code1 == '`' ) && code1 == code2 &&
          2 * ( c2.left() - c1.right() ) < 3 * c1.width() )
        { c1.join( c2 ); c1.only_guess( '"', 0 ); delete_character( i + 1 ); }
    }
  }

  // join a comma followed by a period into a semicolon
  for( int i = big_initials(); i < characters() - 1; ++i )
  {
    Character & c1 = character( i );
    Character & c2 = character( i + 1 );
    if( c1.guesses() == 1 && c2.guesses() == 1 )
    {
      int code1 = c1.guess( 0 ).code;
      int code2 = c2.guess( 0 ).code;
      if( code1 == ',' && code2 == '.' && c1.top() > c2.bottom() &&
          c2.left() - c1.right() < c2.width() )
        { c1.join( c2 ); c1.only_guess( ';', 0 ); delete_character( i + 1 ); }
    }
  }

  // choose between '.' and '-'
  // (only the last character of the line is examined)
  if( characters() >= 2 )
  {
    Character & c = character( characters() - 1 );
    if( c.guesses() >= 2 &&
        c.guess( 0 ).code == '.' && c.guess( 1 ).code == '-' )
    {
      const Character & lc = character( characters() - 2 );
      if( lc.guesses() && UCS::isalpha( lc.guess( 0 ).code ) )
        c.swap_guesses( 0, 1 );
    }
  }

  // join a 'n' followed by a 'I' into a 'm'
  for( int i = big_initials(); i < characters() - 1; ++i )
  {
    Character & c1 = character( i );
    Character & c2 = character( i + 1 );
    if( c1.guesses() == 1 && c2.guesses() == 1 )
    {
      int code1 = c1.guess( 0 ).code;
      int code2 = c2.guess( 0 ).code;
      if( code1 == 'n' && ( code2 == 'I' || code2 == 'l' ) &&
          Ocrad::similar( c1.height(), c2.height(), 10 ) &&
          c2.left() - c1.right() < c2.width() )
        { c1.join( c2 ); c1.only_guess( 'm', 0 ); delete_character( i + 1 ); }
    }
  }

  // separate merged 'VV'
  {
    int mean_upper_width = 0;           // computed on first use
    for( int i = big_initials(); i < characters(); ++i )
    {
      Character & c = character( i );
      if( !c.guesses() || c.guess( 0 ).code != 'W' || c.width() <= c.height() ||
          c.blobs() != 1 || c.blob( 0 ).holes() ) continue;
      if( mean_upper_width == 0 )
      {
        int count = 0;
        for( int j = big_initials(); j < characters(); ++j )
        {
          const Character & cj = character( j );
          if( cj.guesses() && UCS::isupper_normal_width( cj.guess( 0 ).code ) )
            { mean_upper_width += cj.width(); ++count; }
        }
        if( count <= 0 ) break;         // no characters to compare
        mean_upper_width /= count;
      }
      if( c.width() < 2 * mean_upper_width ) continue;
      const Blob & b = c.blob( 0 );
      // find the lowest set pixel in the center column; it must lie above
      // 20% of the blob height for the blob to be cut in two
      int row = b.bottom();
      while( row >= b.top() && b.id( row, b.hcenter() ) == 0 ) --row;
      if( row >= b.vpos( 20 ) ) continue;
      Rectangle r1( b.left(), b.top(), b.hcenter() - 1, b.bottom() );
      Rectangle r2( b.hcenter() + 1, b.top(), b.right(), b.bottom() );
      Blob b1( b, r1 );
      Blob b2( b, r2 );
      b1.adjust_height();
      b2.adjust_height();
      if( 2 * b1.height() < b.height() || 2 * b2.height() < b.height() ||
          !Ocrad::similar( b1.height(), b2.height(), 10, 2 ) ) continue;
      Character c1( new Blob( b1 ) );
      Character c2( new Blob( b2 ) );
      c1.only_guess( 'V', 0 );
      c2.only_guess( 'V', 0 );
      c = c1;
      // insert the second 'V' right after the first and skip over it
      ++i; cpv.insert( cpv.begin() + i, new Character( c2 ) );
    }
  }

  // join the sequence '°', '/', 'o', ' ' into a '%'
  for( int i = big_initials(); i + 2 < characters(); ++i )
  {
    Character & c1 = character( i );
    if( c1.guesses() == 1 && c1.guess( 0 ).code == UCS::DEG )
    {
      if( character( i + 1 ).maybe('/') &&
          character( i + 2 ).maybe('o') &&
          ( i + 3 >= characters() || character( i + 3 ).maybe(' ') ) )
      {
        c1.join( character( i + 1 ) ); c1.join( character( i + 2 ) );
        delete_character( i + 2 ); delete_character( i + 1 );
        c1.only_guess( '%', 0 );
      }
    }
  }
}