/* GNU Ocrad - Optical Character Recognition program
Copyright (C) 2003-2019 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include
#include
#include "common.h"
#include "rectangle.h"
#include "segment.h"
#include "mask.h"
#include "track.h"
#include "ucs.h"
#include "bitmap.h"
#include "blob.h"
#include "character.h"
#include "page_image.h"
#include "textline.h"
#include "textblock.h"
#include "textpage.h"
namespace {
struct Zone
{
Mask mask;
std::vector< Blob * > blobp_vector;
Zone( const Rectangle & re ) : mask( re ) {}
void join( Zone & z );
};
void Zone::join( Zone & z )
{
mask.add_mask( z.mask );
blobp_vector.insert( blobp_vector.end(),
z.blobp_vector.begin(), z.blobp_vector.end() );
z.blobp_vector.clear();
}
int blobs_in_page( const std::vector< Zone > & zone_vector )
{
int sum = 0;
for( unsigned i = 0; i < zone_vector.size(); ++i )
sum += zone_vector[i].blobp_vector.size();
return sum;
}
void bprint( const std::vector< Zone > & zone_vector, FILE * const outfile )
{
// std::fprintf( outfile, "page size %dw x %dh\n", width(), height() );
std::fprintf( outfile, "total zones in page %d\n", (int)zone_vector.size() );
std::fprintf( outfile, "total blobs in page %d\n\n", blobs_in_page( zone_vector ) );
for( unsigned zindex = 0; zindex < zone_vector.size(); ++zindex )
{
const Rectangle & r = zone_vector[zindex].mask;
const std::vector< Blob * > & blobp_vector = zone_vector[zindex].blobp_vector;
std::fprintf( outfile, "zone %d of %d\n", zindex + 1, (int)zone_vector.size() );
std::fprintf( outfile, "zone size %dw x %dh\n", r.width(), r.height() );
std::fprintf( outfile, "total blobs in zone %u\n\n",
(unsigned)zone_vector[zindex].blobp_vector.size() );
for( unsigned i = 0; i < blobp_vector.size(); ++i )
blobp_vector[i]->print( outfile );
}
}
// Merges two blobs into one and deletes the other.
// The blob whose top() is smaller (p1 on a tie) survives. References to
// the blob being removed are redirected to the survivor: in v2 from the
// beginning up to and including index 'i' when the two pointers had to
// be exchanged, else in v1 from index 'i' to the end. The removed blob
// is erased from 'blobp_vector' (searched from the back), its bitmap is
// added to the survivor, and it is deleted.
// Calls Ocrad::internal_error if the blob to remove is not in
// 'blobp_vector'.
inline void join_blobs( std::vector< Blob * > & blobp_vector,
                        std::vector< Blob * > & v1,
                        std::vector< Blob * > & v2,
                        Blob * p1, Blob * p2, int i )
  {
  const bool exchange = ( p1->top() > p2->top() );
  if( exchange )
    {
    std::swap( p1, p2 );
    std::replace( v2.begin(), v2.begin() + ( i + 1 ), p2, p1 );
    }
  else
    std::replace( v1.begin() + i, v1.end(), p2, p1 );

  // find the last occurrence of p2 in blobp_vector
  int pos = blobp_vector.size();
  for( --pos; pos >= 0; --pos )
    if( blobp_vector[pos] == p2 ) break;
  if( pos < 0 ) Ocrad::internal_error( "join_blobs, lost blob." );

  blobp_vector.erase( blobp_vector.begin() + pos );
  p1->add_bitmap( *p2 );
  delete p2;
  }
// Deletes and removes from 'blobp_vector' every blob whose height is
// more than 35 times its width or whose width is more than 25 times its
// height. The vector is walked from the back to the front.
void ignore_abnormal_blobs( std::vector< Blob * > & blobp_vector )
  {
  unsigned pos = blobp_vector.size();
  while( pos > 0 )
    {
    --pos;
    Blob * const bp = blobp_vector[pos];
    if( !( bp->height() > 35 * bp->width() ||
           bp->width() > 25 * bp->height() ) ) continue;
    delete bp;
    blobp_vector.erase( blobp_vector.begin() + pos );
    }
  }
// Deletes and removes the small blobs from 'blobp_vector', keeping the
// relative order of the remaining ones.
// A blob is kept if its height or its width is greater than 4, or if
// its height or width is greater than 2 and its area() is greater than 5.
void ignore_small_blobs( std::vector< Blob * > & blobp_vector )
  {
  unsigned kept = 0;
  for( unsigned i = 0; i < blobp_vector.size(); ++i )
    {
    Blob * const bp = blobp_vector[i];
    const bool keep =
      bp->height() > 4 || bp->width() > 4 ||
      ( ( bp->height() > 2 || bp->width() > 2 ) && bp->area() > 5 );
    if( keep ) blobp_vector[kept++] = bp;	// compact towards the front
    else delete bp;
    }
  blobp_vector.erase( blobp_vector.begin() + kept, blobp_vector.end() );
  }
// For every blob at least 11 high, moves top() one row down if the top
// row has at most one bit set, and then moves bottom() one row up if the
// (possibly new) bottom row has at most one bit set.
void remove_top_bottom_noise( std::vector< Blob * > & blobp_vector )
  {
  for( std::vector< Blob * >::iterator it = blobp_vector.begin();
       it != blobp_vector.end(); ++it )
    {
    Blob & blob = **it;
    if( blob.height() >= 11 )
      {
      // counting stops as soon as a second set bit is found
      const int first_row = blob.top();
      int bits = 0;
      for( int col = blob.left(); bits < 2 && col <= blob.right(); ++col )
        if( blob.get_bit( first_row, col ) ) ++bits;
      if( bits < 2 ) blob.top( first_row + 1 );

      const int last_row = blob.bottom();
      bits = 0;
      for( int col = blob.left(); bits < 2 && col <= blob.right(); ++col )
        if( blob.get_bit( last_row, col ) ) ++bits;
      if( bits < 2 ) blob.bottom( last_row - 1 );
      }
    }
  }
// For every blob at least 6 wide, moves left() one column to the right
// if the left column has at most one bit set, and then moves right() one
// column to the left if the right column has at most one bit set.
void remove_left_right_noise( std::vector< Blob * > & blobp_vector )
  {
  for( std::vector< Blob * >::iterator it = blobp_vector.begin();
       it != blobp_vector.end(); ++it )
    {
    Blob & blob = **it;
    if( blob.width() >= 6 )
      {
      // counting stops as soon as a second set bit is found
      const int first_col = blob.left();
      int bits = 0;
      for( int row = blob.top(); bits < 2 && row <= blob.bottom(); ++row )
        if( blob.get_bit( row, first_col ) ) ++bits;
      if( bits < 2 ) blob.left( first_col + 1 );

      const int last_col = blob.right();
      bits = 0;
      for( int row = blob.top(); bits < 2 && row <= blob.bottom(); ++row )
        if( blob.get_bit( row, last_col ) ) ++bits;
      if( bits < 2 ) blob.right( last_col - 1 );
      }
    }
  }
void find_holes( std::vector< Zone > & zone_vector )
{
for( unsigned zi = 0; zi < zone_vector.size(); ++zi )
{
std::vector< Blob * > & blobp_vector = zone_vector[zi].blobp_vector;
for( unsigned bvi = 0; bvi < blobp_vector.size(); ++bvi )
blobp_vector[bvi]->find_holes();
}
}
// Removes from 'blobp_vector' and deletes every blob whose width is at
// least half the width of 're'. Depending on the tests below, removing
// such a blob may also delete the blobs it includes, or every remaining
// blob in the vector.
void ignore_wide_blobs( const Rectangle & re,
std::vector< Blob * > & blobp_vector )
{
// 'i' is only advanced when the blob at 'i' is kept; after an erase the
// next blob to examine is already at index 'i'.
for( unsigned i = 0; i < blobp_vector.size(); )
{
Blob & b = *blobp_vector[i];
if( 2 * b.width() < re.width() ) { ++i; continue; }
// Only the pointer is erased here; 'b' stays valid and is deleted on
// every path below.
blobp_vector.erase( blobp_vector.begin() + i );
// NOTE(review): area() and size() presumably are the number of set bits
// and the number of bits in the bounding box; confirm in Blob/Bitmap.
if( 4 * b.area() <= 3 * b.size() )
{
// Count the following blobs with size() >= 16, stopping at the first
// one whose top is below the bottom of 'b'.
int blobs = 0;
for( unsigned j = i; j < blobp_vector.size(); ++j )
{
if( blobp_vector[j]->top() > b.bottom() ) break;
if( blobp_vector[j]->size() >= 16 ) ++blobs;
}
if( blobs <= b.size() / 400 )
{
// In these two cases only 'b' itself is deleted.
if( 4 * b.area() <= b.size() ) // thin grid or frame
{ delete &b; continue; }
b.find_holes();
// 'b' is taken as a frame if it has a hole whose size() and area() are
// each at least a quarter of b.size(). Holes are only examined when
// there are fewer of them than the smaller dimension of 'b'.
bool frame = false;
if( b.holes() < std::min( b.height(), b.width() ) )
for( int j = 0; j < b.holes(); ++j )
{
if( 4 * b.hole( j ).size() >= b.size() &&
4 * b.hole( j ).area() >= b.size() )
{ frame = true; break; }
}
if( frame ) { delete &b; continue; }
}
}
// picture, not frame
// If 'b' spans more than 4/5 of both the width and the height of 're',
// all the remaining blobs are deleted and the scan ends.
if( 5 * b.width() > 4 * re.width() && 5 * b.height() > 4 * re.height() )
{
for( unsigned j = 0; j < blobp_vector.size(); ++j )
delete blobp_vector[j];
blobp_vector.clear(); delete &b; break;
}
// Otherwise delete the blobs at index 'i' or above that 'b' includes,
// walking from the back so that erasing does not disturb the indexes
// still to be visited.
for( unsigned j = blobp_vector.size(); j > i; )
{
const Blob & b2 = *blobp_vector[--j];
if( b.includes( b2 ) )
{ delete &b2; blobp_vector.erase( blobp_vector.begin() + j ); }
}
delete &b;
}
}
// Returns a mean of the heights of the blobs in 'blobp_vector', or 0 if
// no height is selected (for example when the vector is empty).
// A histogram of heights is built from the blobs with height >= 10 and
// width < 3 * height; if no blob qualifies, it is built from all the
// blobs. Walking the histogram from the smallest height, a height
// contributes to the mean while the samples counted through it reach at
// least a tenth of the total and the samples counted before it are
// fewer than nine tenths of the total.
int mean_blob_height( const std::vector< Blob * > & blobp_vector )
  {
  std::vector< int > height_count;		// histogram indexed by height
  unsigned total = 0;

  // pass 0 uses only the qualifying blobs; pass 1 (all the blobs) runs
  // only if pass 0 left the histogram empty
  for( int pass = 0; pass < 2 && height_count.empty(); ++pass )
    for( unsigned i = 0; i < blobp_vector.size(); ++i )
      {
      const unsigned h = blobp_vector[i]->height();
      if( pass == 0 )
        {
        const unsigned w = blobp_vector[i]->width();
        if( h < 10 || w >= 3 * h ) continue;
        }
      if( h >= height_count.size() ) height_count.resize( h + 1 );
      ++height_count[h]; ++total;
      }

  int height_sum = 0;
  int used = 0;
  unsigned seen = 0;			// samples at smaller heights
  for( unsigned h = 0; h < height_count.size(); ++h )
    {
    const int n = height_count[h];
    const bool reaches_low = ( 10 * ( seen + n ) >= total );
    const bool below_high = ( 10 * seen < 9 * total );
    if( reaches_low && below_high ) { height_sum += n * h; used += n; }
    seen += n;
    }
  return used ? height_sum / used : height_sum;
  }
// Distributes the blobs of 'blobp_vector' into zones appended to
// 'zone_vector', then orders the zones and joins some groups of them.
// On return 'blobp_vector' is empty; every blob pointer has either been
// moved into a zone or deleted.
// Returns the final number of zones in 'zone_vector', or 0 (leaving
// 'zone_vector' untouched) if 'blobp_vector' is empty.
int analyse_layout( std::vector< Blob * > & blobp_vector,
std::vector< Zone > & zone_vector )
{
if( blobp_vector.empty() ) return 0;
const int mean_height = mean_blob_height( blobp_vector );
// The first blob always seeds the first zone; the height test applied
// to the other blobs is not applied to it.
zone_vector.push_back( Zone( *blobp_vector[0] ) );
zone_vector.back().blobp_vector.push_back( blobp_vector[0] );
for( unsigned i = 1; i < blobp_vector.size(); ++i )
{
Blob & b = *blobp_vector[i];
// Blobs more than 10 times taller than the mean are discarded.
if( b.height() > 10 * mean_height ) { delete &b; continue; }
// Find the zones whose mask is at a distance from 'b' smaller than
// twice the mean height. The first one found absorbs all the others.
int first = -1;
for( unsigned j = 0; j < zone_vector.size(); ++j )
{
if( zone_vector[j].mask.distance( b ) < 2 * mean_height )
{
if( first < 0 ) first = j;
else
{
zone_vector[first].join( zone_vector[j] );
// 'j' is greater than 'first' here, so decrementing it is safe; it
// makes the next iteration examine the zone that moved into slot 'j'.
zone_vector.erase( zone_vector.begin() + j ); --j;
}
}
}
if( first >= 0 )
{
zone_vector[first].mask.add_rectangle( b );
zone_vector[first].blobp_vector.push_back( &b );
}
else
{
// No zone is near enough; 'b' starts a new zone.
zone_vector.push_back( Zone( b ) );
zone_vector.back().blobp_vector.push_back( &b );
}
}
blobp_vector.clear();
// sort zone_vector
// A cut is recorded at every zone whose top is below the lowest bottom
// of all the preceding zones. The last cut is the end of the vector.
int botmax = zone_vector.empty() ? 0 : zone_vector[0].mask.bottom();
std::vector< int > cut_index_vector;
for( unsigned i = 1; i < zone_vector.size(); ++i )
{
if( zone_vector[i].mask.top() > botmax ) cut_index_vector.push_back( i );
botmax = std::max( botmax, zone_vector[i].mask.bottom() );
}
cut_index_vector.push_back( zone_vector.size() );
// Process each group of zones [begin, end) delimited by the cuts.
for( unsigned begin = 0, cut = 0; cut < cut_index_vector.size(); ++cut )
{
const unsigned end = cut_index_vector[cut];
// Selection sort of the group using Mask::precedes.
for( unsigned i = begin; i + 1 < end; ++i )
{
unsigned first = i;
for( unsigned j = i + 1; j < end; ++j )
if( zone_vector[j].mask.precedes( zone_vector[first].mask ) )
first = j;
if( first != i ) std::swap( zone_vector[i], zone_vector[first] );
}
// The group is joined into a single zone only if it has more than one
// zone, no zone has more than 80 blobs, no zone is at a v_distance
// from the first zone greater than the sum of their heights, and no
// zone is more than 4 times taller than the mean height of its blobs.
bool join = ( end - begin > 1 );
for( unsigned i = begin; join && i < end; ++i )
if( zone_vector[i].blobp_vector.size() > 80 ||
zone_vector[i].mask.v_distance( zone_vector[begin].mask ) >
zone_vector[i].mask.height() + zone_vector[begin].mask.height() )
join = false;
for( unsigned i = begin; join && i < end; ++i )
if( zone_vector[i].mask.height() > 4 * mean_blob_height( zone_vector[i].blobp_vector ) )
join = false;
if( join )
{
for( unsigned i = begin + 1; i < end; ++i )
zone_vector[begin].join( zone_vector[i] );
zone_vector.erase( zone_vector.begin() + ( begin + 1 ),
zone_vector.begin() + end );
// The erased zones shift the current and the following cuts down by
// the number of zones removed; the next group starts right after the
// joined zone.
for( unsigned i = cut; i < cut_index_vector.size(); ++i )
cut_index_vector[i] -= ( end - begin - 1 );
++begin;
}
else begin = end;
}
return zone_vector.size();
}
void scan_page( const Page_image & page_image, std::vector< Zone > & zone_vector,
const int debug_level, const bool layout )
{
const Rectangle & re = page_image;
const int zthreshold = page_image.threshold();
std::vector< Blob * > blobp_vector;
std::vector< Blob * > old_data( re.width(), (Blob *) 0 );
std::vector< Blob * > new_data( re.width(), (Blob *) 0 );
for( int row = re.top(); row <= re.bottom(); ++row )
{
old_data.swap( new_data );
for( int col = re.left(); col <= re.right(); ++col )
{
const int dcol = col - re.left();
if( !page_image.get_bit( row, col, zthreshold ) )
new_data[dcol] = 0; // white pixel
else // black pixel
{
Blob *p;
Blob *lp = ( (dcol > 0) ? new_data[dcol-1] : 0 );
Blob *ltp = ( (dcol > 0) ? old_data[dcol-1] : 0 );
Blob *tp = old_data[dcol];
Blob *rtp = ( (col < re.right()) ? old_data[dcol+1] : 0 );
if( lp ) { p = lp; p->add_point( row, col ); }
else if( ltp ) { p = ltp; p->add_point( row, col ); }
else if( tp ) { p = tp; p->add_point( row, col ); }
else if( rtp ) { p = rtp; p->add_point( row, col ); }
else
{
p = new Blob( col, row, col, row );
p->set_bit( row, col, true );
blobp_vector.push_back( p );
}
new_data[dcol] = p;
if( rtp && p != rtp )
join_blobs( blobp_vector, old_data, new_data, p, rtp, dcol );
}
}
}
if( debug_level <= 99 && blobp_vector.size() > 3 )
{
ignore_wide_blobs( re, blobp_vector );
ignore_small_blobs( blobp_vector );
ignore_abnormal_blobs( blobp_vector );
remove_top_bottom_noise( blobp_vector );
remove_left_right_noise( blobp_vector );
}
if( layout && re.width() > 200 && re.height() > 200 &&
blobp_vector.size() > 3 )
{
analyse_layout( blobp_vector, zone_vector );
if( debug_level <= 99 && zone_vector.size() > 1 )
for( unsigned i = 0; i < zone_vector.size(); ++i )
ignore_wide_blobs( zone_vector[i].mask, zone_vector[i].blobp_vector );
}
else
{
zone_vector.push_back( Zone( re ) );
zone_vector.back().blobp_vector.swap( blobp_vector );
}
find_holes( zone_vector );
}
} // end namespace
Textpage::Textpage( const Page_image & page_image, const char * const filename,
const Control & control, const bool layout )
: Rectangle( page_image ), name( filename )
{
const int debug_level = control.debug_level;
if( debug_level < 0 || debug_level > 100 ) return;
std::vector< Zone > zone_vector; // layout zones
scan_page( page_image, zone_vector, debug_level, layout );
if( verbosity >= 1 )
std::fprintf( stderr, "number of text blocks = %d\n", (int)zone_vector.size() );
if( debug_level >= 98 )
{
if( control.outfile ) bprint( zone_vector, control.outfile );
return;
}
if( debug_level > 95 || ( debug_level > 89 && debug_level < 94 ) ) return;
// build a Textblock for every zone with text
for( unsigned i = 0; i < zone_vector.size(); ++i )
{
Textblock * const tbp = new Textblock( page_image, zone_vector[i].mask,
zone_vector[i].blobp_vector );
if( tbp->textlines() && debug_level < 90 )
tbp->recognize( control );
if( tbp->textlines() ) tbpv.push_back( tbp );
else delete tbp;
}
if( debug_level == 0 ) return;
if( !control.outfile ) return;
if( debug_level >= 86 )
{
bool graph = ( debug_level >= 88 );
bool recursive = ( debug_level & 1 );
for( int i = 0; i < textblocks(); ++i )
tbpv[i]->dprint( control, graph, recursive );
return;
}
if( debug_level > 77 ) return;
if( debug_level >= 70 )
{
Page_image tmp( page_image );
if( ( debug_level - 70 ) & 1 ) // mark zones
for( unsigned i = 0; i < zone_vector.size(); ++i )
{
if( debug_level == 71 ) tmp.draw_mask( zone_vector[i].mask );
else tmp.draw_rectangle( zone_vector[i].mask );
}
if( ( debug_level - 70 ) & 2 ) // mark lines
{
for( int i = 0; i < textblocks(); ++i ) tbpv[i]->lmark( tmp );
}
if( ( debug_level - 70 ) & 4 ) // mark characters
{
for( int i = 0; i < textblocks(); ++i ) tbpv[i]->cmark( tmp );
}
tmp.save( control.outfile, control.filetype );
return;
}
}
// Deletes the text blocks of the page, from the last one to the first.
Textpage::~Textpage()
  {
  int remaining = textblocks();
  while( remaining > 0 ) delete tbpv[--remaining];
  }
// Returns the text block at index 'i'.
// Calls Ocrad::internal_error if 'i' is not in [0, textblocks()).
const Textblock & Textpage::textblock( const int i ) const
  {
  const bool in_range = ( i >= 0 && i < textblocks() );
  if( !in_range )
    Ocrad::internal_error( "Textpage::textblock, index out of bounds." );
  return *tbpv[i];
  }
int Textpage::textlines() const
{
int total = 0;
for( int i = 0; i < textblocks(); ++i )
total += tbpv[i]->textlines();
return total;
}
int Textpage::characters() const
{
int total = 0;
for( int i = 0; i < textblocks(); ++i )
total += tbpv[i]->characters();
return total;
}
void Textpage::print( const Control & control ) const
{
if( control.outfile )
for( int i = 0; i < textblocks(); ++i )
tbpv[i]->print( control );
}
void Textpage::xprint( const Control & control ) const
{
if( !control.exportfile ) return;
std::fprintf( control.exportfile, "source file %s\n", name.c_str() );
std::fprintf( control.exportfile, "total text blocks %d\n", textblocks() );
for( int i = 0; i < textblocks(); ++i )
{
const Textblock & tb = *(tbpv[i]);
std::fprintf( control.exportfile, "text block %d %d %d %d %d\n", i + 1,
tb.left(), tb.top(), tb.width(), tb.height() );
tb.xprint( control );
}
}