/* GNU Ocrad - Optical Character Recognition program Copyright (C) 2003-2019 Antonio Diaz Diaz. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include "common.h" #include "rational.h" #include "rectangle.h" #include "track.h" #include "ucs.h" #include "user_filter.h" #include "bitmap.h" #include "blob.h" #include "character.h" #include "page_image.h" #include "textline.h" #include "textblock.h" namespace { void insert_line( std::vector< Textline * > & textlinep_vector, int i ) { textlinep_vector.insert( textlinep_vector.begin() + i, new Textline ); } void delete_line( std::vector< Textline * > & textlinep_vector, int i ) { delete textlinep_vector[i]; textlinep_vector.erase( textlinep_vector.begin() + i ); } // Build the vertical composite characters. // void join_characters( std::vector< Textline * > & tlpv ) { for( unsigned current_line = 0; current_line < tlpv.size(); ++current_line ) { Textline & line = *tlpv[current_line]; for( int i = 0 ; i < line.characters() - 1; ) { Character & c1 = line.character( i ); bool joined = false; for( int j = i + 1 ; j < line.characters(); ++j ) { Character & c2 = line.character( j ); if( !c1.h_overlaps( c2 ) ) continue; Character *cup, *cdn; if( c1.vcenter() < c2.vcenter() ) cup = &c1, cdn = &c2; else cup = &c2, cdn = &c1; if( cdn->includes_hcenter( *cup ) || cup->includes_hcenter( *cdn ) || ( cdn->top() > cup->bottom() && cdn->hcenter() < cup->hcenter() ) || ( cdn->blobs() == 2 && 2 * cdn->blob( 0 ).size() < cdn->blob( 1 ).size() && cdn->blob( 0 ).includes_vcenter( *cup ) ) ) { int k; if( 64 * c1.size() < c2.main_blob().size() ) k = i; else if( 64 * c2.size() < c1.main_blob().size() ) k = j; else if( cdn == &c2 ) { c2.join( c1 ); k = i; } else { c1.join( c2 ); k = j; } line.delete_character( k ); joined = true; break; } } if( !joined ) ++i; } } } } // end namespace void Textblock::apply_filters( const Control & control ) { if( textlines() <= 0 ) return; for( unsigned f = 0; f < control.filters.size(); ++f ) { if( control.filters[f].user_filterp ) { for( int i = 0; i < textlines(); ++i ) tlpv[i]->apply_user_filter( *control.filters[f].user_filterp ); continue; } const Filter::Type filter = control.filters[f].type; if( filter != Filter::text_block ) { for( int i = 0; i < textlines(); ++i ) tlpv[i]->apply_filter( filter ); continue; } int l = right(), t = bottom(), r = left(), b = top(); for( int i = 0; i < textlines(); ++i ) { const Textline & line = *tlpv[i]; int first = line.characters(), last = -1; for( int j = line.big_initials(); j < line.characters(); ++j ) if( line.is_key_character( j ) ) { l = std::min( l, line.character(j).left() ); first = j; break; } for( int j = line.characters() - 1; j >= first; --j ) if( line.is_key_character( j ) ) { r = std::max( r, line.character(j).right() ); last = j; break; } if( i == 0 ) { for( int j = first; j <= last; ++j ) if( line.is_key_character( j ) ) t = std::min( t, line.character(j).top() ); } else if( i == textlines() - 1 ) { for( int j = first; j <= last; ++j ) if( line.is_key_character( j ) ) b = std::max( b, line.character(j).bottom() ); } } if( r < l || b < t ) continue; // can't apply filter; no text Rectangle re( l, t, r, b ); for( int i = 0; i < textlines(); ++i ) { Textline & line = *tlpv[i]; bool modified = false; for( int j = line.characters() - 1; j >= 0; --j ) if( line.character(j).height() >= 2 * line.height() || !re.includes( line.character(j).vcenter(), line.character(j).hcenter() ) ) { line.delete_character( j ); modified = true; } if( modified ) line.remove_leadind_trailing_duplicate_spaces(); } } } Textblock::Textblock( const Rectangle & page, const Rectangle & block, std::vector< Blob * > & blobp_vector ) : Rectangle( block ) { std::vector< Blob * > pending; std::vector< Blob * > pending_tall; std::vector< Blob * > pending_short; for( unsigned begin = 0, end = 0; end < blobp_vector.size(); begin = end ) { int botmax = blobp_vector[begin]->bottom(); // make cuts while( ++end < blobp_vector.size() ) { if( blobp_vector[end]->top() > botmax ) break; botmax = std::max( botmax, blobp_vector[end]->bottom() ); } // Classify blobs by height. unsigned samples = 0; std::vector< int > height_distrib; for( unsigned i = begin; i < end; ++i ) { if( blobp_vector[i]->is_abnormal() ) continue; unsigned h = blobp_vector[i]->height(); if( h >= height_distrib.size() ) height_distrib.resize( h + 1 ); ++height_distrib[h]; ++samples; } if( height_distrib.empty() ) // all blobs are abnormal for( unsigned i = begin; i < end; ++i ) { unsigned h = blobp_vector[i]->height(); if( h >= height_distrib.size() ) height_distrib.resize( h + 1 ); ++height_distrib[h]; ++samples; } int mean_height = 0; int valid_samples = 0; for( unsigned i = 0, count = 0; i < height_distrib.size(); ++i ) { int a = height_distrib[i]; if( 10 * ( count + a ) >= samples && 10 * count < 9 * samples ) { mean_height += a * i; valid_samples += a; } count += a; } if( valid_samples ) mean_height /= valid_samples; for( unsigned i = begin; i < end; ++i ) { Blob * const p = blobp_vector[i]; const bool a = p->is_abnormal(); if( p->height() >= 2 * mean_height || ( a && p->height() > mean_height ) ) pending_tall.push_back( p ); else if( 2 * p->height() <= mean_height || p->height() <= 5 || ( a && p->height() < mean_height ) ) pending_short.push_back( p ); else pending.push_back( p ); } } if( pending.empty() ) { for( unsigned i = 0; i < blobp_vector.size(); ++i ) delete blobp_vector[i]; blobp_vector.clear(); return; } blobp_vector.clear(); // Assign normal blobs to characters and create lines. { int min_line = 0; // first line of current cut tlpv.push_back( new Textline ); int current_line = min_line = textlines() - 1; tlpv[current_line]->shift_characterp( new Character( pending[0] ) ); for( unsigned i = 1; i < pending.size(); ++i ) { Blob & b = *pending[i]; current_line = std::max( min_line, current_line - 2 ); while( true ) { const Character *cl = 0, *cr = 0; for( int j = tlpv[current_line]->characters() - 1; j >= 0; --j ) { const Character & cj = tlpv[current_line]->character( j ); if( !b.includes_hcenter( cj ) && !cj.includes_hcenter( b ) ) { if( b.h_precedes( cj ) ) cr = &cj; else { cl = &cj; break; } } } if( ( cl && ( cl->includes_vcenter( b ) || b.includes_vcenter( *cl ) ) ) || ( cr && ( cr->includes_vcenter( b ) || b.includes_vcenter( *cr ) ) ) ) { tlpv[current_line]->shift_characterp( new Character( &b ) ); break; } else if( ( cl && cl->top() > b.bottom() ) || ( cr && cr->top() > b.bottom() ) ) { insert_line( tlpv, current_line ); tlpv[current_line]->shift_characterp( new Character( &b ) ); break; } else if( ( cl && cl->v_overlap_percent( b ) > 5 ) || ( cr && cr->v_overlap_percent( b ) > 5 ) ) { tlpv[current_line]->shift_characterp( new Character( &b ) ); break; } else if( ++current_line >= textlines() ) { tlpv.push_back( new Textline ); current_line = textlines() - 1; tlpv[current_line]->shift_characterp( new Character( &b ) ); break; } } } for( int i = textlines() - 1; i >= 0; --i ) if( !tlpv[i]->characters() ) delete_line( tlpv, i ); join_characters( tlpv ); // Create tracks of lines. for( int i = 0; i < textlines(); ++i ) tlpv[i]->set_track(); // Insert tall blobs. // Seek up, then seek down, needed for slanted or curved lines. current_line = 0; for( unsigned i = 0; i < pending_tall.size(); ++i ) { Blob & b = *pending_tall[i]; while( current_line > 0 && b.bottom() < tlpv[current_line]->vcenter( b.hcenter() ) ) --current_line; while( current_line < textlines() && b.top() > tlpv[current_line]->vcenter( b.hcenter() ) ) ++current_line; if( current_line >= textlines() ) { --current_line; delete &b; continue; } Textline & l = *tlpv[current_line]; const int bi = l.big_initials(); const int mh = l.mean_height(); if( b.height() <= 3 * mh && ( b.height() <= 2 * mh || l.character( bi ).left() < b.left() ) ) l.shift_characterp( new Character( &b ) ); else if( !l.characters() || l.character( std::min( bi+1, l.characters() - 1 ) ).left() > b.hcenter() ) l.shift_characterp( new Character( &b ), true ); // big initial else delete &b; } // for( int i = 0; i < textlines(); ++i ) tlpv[i]->verify_big_initials(); // Insert short blobs. // Seek up, then seek down, needed for slanted or curved lines. current_line = 0; for( unsigned i = 0; i < pending_short.size(); ++i ) { Blob & b = *pending_short[i]; while( current_line > 0 && b.bottom() < tlpv[current_line]->top( b.hcenter() ) ) --current_line; int temp = std::max( 0, current_line - 1 ); while( current_line < textlines() && b.top() > tlpv[current_line]->bottom( b.hcenter() ) ) ++current_line; if( current_line >= textlines() ) { const Textline & l = *tlpv[--current_line]; const Character *p = l.character_at( b.hcenter() ); if( b.top() > l.bottom( b.hcenter() ) + l.height() / 2 && ( !p || b.top() > p->bottom() + l.height() / 2 ) ) { delete &b; continue; } else temp = current_line; } if( current_line - temp > 1 ) temp = current_line - 1; if( current_line != temp && 2 * ( b.top() - tlpv[temp]->bottom( b.hcenter() ) ) < tlpv[current_line]->top( b.hcenter() ) - b.bottom() ) current_line = temp; tlpv[current_line]->shift_characterp( new Character( &b ) ); } // remove clipped lines at top or bottom of page if( textlines() > 2 ) { const Textline * lp = tlpv[textlines()-1]; for( int i = 0, c = 0; i < lp->characters(); ++i ) if( lp->character( i ).bottom() >= page.bottom() && 2 * ++c >= lp->characters() ) { delete_line( tlpv, textlines() - 1 ); break; } lp = tlpv[0]; const int t = std::max( page.top(), 1 ); for( int i = 0, c = 0; i < lp->characters(); ++i ) if( lp->character( i ).top() <= t && 2 * ++c >= lp->characters() ) { delete_line( tlpv, 0 ); break; } } } // Second pass. Join lines of i-dots and tildes. for( int current_line = 0; current_line < textlines() - 1; ) { bool joined = false; Textline & line1 = *tlpv[current_line]; Textline & line2 = *tlpv[current_line+1]; if( line1.characters() <= 2 * line2.characters() && 2 * line1.mean_height() < line2.mean_height() ) for( int i1 = 0; !joined && i1 < line1.characters(); ++i1 ) { Character & c1 = line1.character( i1 ); if( 2 * c1.height() >= line2.mean_height() ) continue; for( int i2 = 0; !joined && i2 < line2.characters(); ++i2 ) { Character & c2 = line2.character( i2 ); if( c2.right() < c1.left() ) continue; if( c2.left() > c1.right() ) break; if( ( c2.includes_hcenter( c1 ) || c1.includes_hcenter( c2 ) ) && c2.top() - c1.bottom() < line2.mean_height() ) { joined = true; line2.join( line1 ); delete_line( tlpv, current_line ); } } } if( !joined ) ++current_line; } join_characters( tlpv ); for( int i = 0; i < textlines(); ++i ) tlpv[i]->verify_big_initials(); // Fourth pass. Remove noise lines. if( textlines() >= 3 ) { for( int i = 0; i + 2 < textlines(); ++i ) { Textline & line1 = *tlpv[i]; Textline & line2 = *tlpv[i+1]; Textline & line3 = *tlpv[i+2]; if( line2.characters() > 2 || line1.characters() < 4 || line3.characters() < 4 ) continue; if( !Ocrad::similar( line1.height(), line3.height(), 10 ) ) continue; if( 8 * line2.height() > line1.height() + line3.height() ) continue; delete_line( tlpv, i + 1 ); } } // Remove leading and trailing noise characters. for( int i = 0; i < textlines(); ++i ) { Textline & l = *tlpv[i]; if( !l.big_initials() && l.characters() > 2 ) { const Character & c0 = l.character( 0 ); const Character & c1 = l.character( 1 ); const Character & c2 = l.character( 2 ); if( c0.blobs() == 1 && 4 * c0.size() < c1.size() && c1.left() - c0.right() > 2 * l.height() && 4 * c0.size() < c2.size() && c2.left() - c1.right() < l.height() ) l.delete_character( 0 ); } if( l.characters() > 2 ) { const Character & c0 = l.character( l.characters() - 1 ); const Character & c1 = l.character( l.characters() - 2 ); const Character & c2 = l.character( l.characters() - 3 ); if( c0.blobs() == 1 && 4 * c0.size() < c1.size() && c0.left() - c1.right() > 2 * l.height() && 4 * c0.size() < c2.size() && c1.left() - c2.right() < l.height() ) l.delete_character( l.characters() - 1 ); } } for( int i = 0; i < textlines(); ++i ) tlpv[i]->insert_spaces(); } Textblock::~Textblock() { for( int i = textlines() - 1; i >= 0; --i ) delete tlpv[i]; } void Textblock::recognize( const Control & control ) { // Recognize characters. for( int i = 0; i < textlines(); ++i ) { // First pass. Recognize the easy characters. tlpv[i]->recognize1( control.charset ); // Second pass. Use context to clear up ambiguities. tlpv[i]->recognize2( control.charset ); } apply_filters( control ); // Remove unrecognized lines. for( int i = textlines() - 1; i >= 0; --i ) { Textline & line1 = *tlpv[i]; bool recognized = false; for( int j = 0 ; j < line1.characters(); ++j ) { if( line1.character( j ).guesses() ) { recognized = true; break; } } if( !recognized ) delete_line( tlpv, i ); } // Add blank lines. if( textlines() >= 3 ) { int min_vdistance = ( tlpv.back()->mean_vcenter() - tlpv.front()->mean_vcenter() ) / ( textlines() - 1 ); for( int i = 0; i + 1 < textlines(); ++i ) { const Textline & line1 = *tlpv[i]; const Textline & line2 = *tlpv[i+1]; if( !Ocrad::similar( line1.characters(), line2.characters(), 50 ) || !Ocrad::similar( line1.width(), line2.width(), 30 ) ) continue; const int vdistance = line2.mean_vcenter() - line1.mean_vcenter(); if( vdistance >= min_vdistance ) continue; const int mh1 = line1.mean_height(), mh2 = line2.mean_height(); if( mh1 < 10 || mh2 < 10 ) continue; if( Ocrad::similar( mh1, mh2, 20 ) && 2 * vdistance > mh1 + mh2 ) min_vdistance = vdistance; } if( min_vdistance > 0 ) for( int i = 0; i + 1 < textlines(); ++i ) { const Textline & line1 = *tlpv[i]; const Textline & line2 = *tlpv[i+1]; int vdistance = line2.mean_vcenter() - line1.mean_vcenter() - min_vdistance; while( 2 * vdistance > min_vdistance ) { insert_line( tlpv, ++i ); vdistance -= min_vdistance; } } } } const Textline & Textblock::textline( const int i ) const { if( i < 0 || i >= textlines() ) Ocrad::internal_error( "line, index out of bounds." ); return *tlpv[i]; } int Textblock::characters() const { int total = 0; for( int i = 0; i < textlines(); ++i ) total += tlpv[i]->characters(); return total; } void Textblock::print( const Control & control ) const { for( int i = 0; i < textlines(); ++i ) tlpv[i]->print( control ); std::fputc( '\n', control.outfile ); } void Textblock::dprint( const Control & control, bool graph, bool recursive ) const { std::fprintf( control.outfile, "%d lines\n\n", textlines() ); for( int i = 0; i < textlines(); ++i ) { std::fprintf( control.outfile, "%d characters in line %d\n", tlpv[i]->characters(), i + 1 ); tlpv[i]->dprint( control, graph, recursive ); } std::fputc( '\n', control.outfile ); } void Textblock::xprint( const Control & control ) const { std::fprintf( control.exportfile, "lines %d\n", textlines() ); for( int i = 0; i < textlines(); ++i ) { std::fprintf( control.exportfile, "line %d chars %d height %d\n", i + 1, tlpv[i]->characters(), tlpv[i]->mean_height() ); tlpv[i]->xprint( control ); } } void Textblock::cmark( Page_image & page_image ) const { for( int i = 0; i < textlines(); ++i ) tlpv[i]->cmark( page_image ); } void Textblock::lmark( Page_image & page_image ) const { for( int i = 0; i < textlines(); ++i ) page_image.draw_track( *tlpv[i] ); }