#! /usr/bin/perl -w
#
# gdocimagecheck.pl
#
# Revision date: 8/24/2011
#
#
# This script performs quality control automatically on a 
# series of folders of scanned government documents.
# 
# This script is run after gdocblankcheck.pl. It runs a double-check
#of pagination in order to confirm page amounts against a MARC #record/textfile. 
#It then compares each image of the file #against a known good reference sample. 
#Images that are blanks are compared against an array of known blanks. 
#Images that are evaluated as too dark or too light are noted in a suspect textfile
# and the folders are moved into a SUSPECT folder for later
# evaluation via CGI script


# Doreva Belfiore
# Rutgers University Law Library
# Camden, NJ

#1 Load required third-party modules

use Image::Magick;
use Roman;

#2 Set defined variables here

my($document, $blank);
my($rg, $prefmat, $rs, $standard);
my($rp, $fixer, $ff);
my($rb, $body, $ra, $average);
my @okblanks = ();

#opens the logfiles

open (ERR, ">>gdocblankcheck.txt");
open (ERR2, ">>gdocsuspect.txt");
open (ERR3, ">>gdocproblem.txt");
open (ERR4, ">>gdocgood.txt");

#3 Load the initial reference information

$gdir = ("./reference");
chdir ("$gdir");

print "Loading reference information now. Please wait... \n";

#Instantiates an image object in ImageMagick.

$gdir = ("./reference");
chdir ("$gdir");

#3a Blank File
            
$blank = new Image::Magick();   
$rb = $blank->Read('blank.tif');
die "$rb" if $rb;
            
#This command shaves 0 pixels off the top/bottom and 
# 500  pixels off the sides so that any black edges
#of the image will not skew the measurement
#to get an accurate brightness level.
            
$rb = $blank->Shave(geometry=>'500x10');
$rb = $blank->Set(page=>'0x0+0+0');
#test write to view
#$rb = $blank->Write(filename=>'blank_trim.tif', compression=>'None');
            
#Gathers statistics about the blank page file
            
@bstats = $blank->Statistics();
            
$blmean = $bstats[3];                  
print "The mean of the blank file is: $blmean \n";
                              
undef $rb;                             
undef $blank;

#3b Clean "Standard" file

$standard = new Image::Magick();
$rs = $standard->Read('standard.tif');
die "$rs" if $rs;

#This command shaves 0 pixels off the top/bottom and 
# 100  pixels off the sides so that any black edges  
#of the image will not skew the measurement
#to get an accurate brightness level.

$rs = $standard->Shave(geometry=>'500x10');
$rs = $standard->Set(page=>'0x0+0+0');
#test write to view
#$rs = $standard->Write(filename=>'standard_trim.tif', compression=>'None');

#Gathers statistics about the clean file

@sstats = $standard->Statistics();

$smean = $sstats[3];
print "The mean of the clean standard file is:   $smean \n";


#4 Here were are going to count the total number of pages found in the directory first and #account for any blanks.

chdir "..";

$rdir = ("../INSERT SERVER PATH HERE/IMAGECHECK");

opendir (DIR, "$rdir");
@lccns = grep /\d+/, readdir DIR;
closedir DIR;
@lccns = sort(@lccns);
             
print  "GOT LCCNS @lccns \n";
                          

#5 Gathering pages (grep)

MAIN: foreach $doc (@lccns){

#test - may need
chdir "$rdir";             
print "\nInitiating quality control check for $doc now . . .\n";
print "\nCounting total number of pages for  $doc . . . \n";


opendir DIR2, "$doc";
@files = grep /\.tif/, readdir DIR2;
closedir DIR2;
@files = sort(@files);
$fnumber = @files;
                                
#test
print "Found $fnumber files in the directory $doc. \n";
                                                                        

#6  Test for multivolume


chdir "$doc";

if ($doc =~ m/([0-9]{1,8})([a-z]{1,2})/) {
print "Found a multivolume! \n";
$mvflag = 1;
}

else {
print "Not a multivolume. Read the regular MARC file. \n";
&readmarc;  
$mvflag = 0;
}


if ($mvflag == 1) {
print "Reading outfile instead of MARC record. \n";
&readoutfile;
}


#7 Evaluate number of pages stated in MARC record against the number of pages
# found in the folder.

print "Matching the number of pages found in the folder against the MARC record for $doc . . .\n";

# First we check that the preface and body pages have been identified

CHECK1: 
if (defined ($pnum)) {
  print "$pnum \n";
}
  else {
  &nopref;
  }

CHECK2: 
if (defined ($bnum)) {
  print "$bnum \n";
  }
  else {
  &nobody;
  }

#8 Test for match of page numbers

CHECK3:

  if ($tnum == $fnumber) {
    &qualcheck;
    }
  elsif ($tnum != $fnumber) {
    &checkprefevenodd;
    }
  else {
    print "Failure. Keep troubleshooting. \n";
    }
   

}	#end MAIN


#10  Undefine variables at the end as final precaution

undef $document;
undef $rg;
undef $prefmat;
undef $rp;
undef $fixer;
undef $ff;
undef $rb;
undef $body;
undef $average;  
undef $fnumber;
undef $tnum;
undef $bnum;

for (keys %okblankhash)
    {
        delete $okblankhash{$_};
    }

#close the error and logfiles 
close ERR;
close ERR2;
close ERR3;
close ERR4;

########SUBROUTINES###########

sub checkprefevenodd {

#Checking even or odd numbers as this makes a difference
#Uses modulus function %

#Checks the prefatory material
$podd_num = $pnum % 2;
if ($podd_num) {
   print "Prefatory page $pnum is odd. \n"; 
   $pnum = ($pnum + 1);
   $pflag = 1;
   }
else { 
  print "Prefatory page $pnum is even. \n"; 
  $pflag = 0;
  }

#print "The pflag is now  $pflag \n";

#Next check
&checkbodyevenodd;

}  #end sub checkprefevenodd


sub checkbodyevenodd {

#Checks the body pages
$bodd_num = $bnum % 2; 
if ($bodd_num) { 
  print "Body page $bnum is odd. \n"; 
  $bnum = ($bnum + 1);
  $bflag = 1;
  }
else { 
  print "Body page $bnum is even. \n";
  $bflag = 0;
  }

#print "The bflag is now $bflag \n";

#Next check
&checkprefblanks;

}  #end sub checkbodyevenodd



sub checkprefblanks {

print "Checking document $doc for any blank pages after the prefatory material \n";
print "or after the body pages. \n";

#Checks the prefatory material to see if there is a trailing blank page
#after an odd number of pages

#Find the suspect preface file
$findit = $pnum-1;
$badpref = @files[$findit];

print "$badpref is a suspect file. \n";

#Instantiates an image object with ImageMagick.        

$prefmat = new Image::Magick();
$rp = $prefmat->Read("$badpref");
die "$rp" if $rp;

print "Checking image $badpref for blank status . . .\n";

#This command shaves 100 pixels off the top/bottom  and 
# 500 off the sides so that it evaluates the middle
#of the image and not the edges to get an accurate brightness level.
$rp = $prefmat->Shave(geometry=>'500x100');
$rp = $prefmat->Set(page=>'0x0+0+0');

#Gathers statistics about the current file
@avstats = $prefmat->Statistics();

$pmean = $avstats[3];
$diff = ($blmean - $pmean);  
print "The difference between $badpref and the reference file is $diff \n";

undef $rp;
undef $prefmat;


if ($diff < 0.03) {
  print "The file $badpref is blank and will be skipped. \n";
  push @okblanks, $badpref;
  $pbflag = 1;
  }
  
                                  
#print "The pbflag is now $pbflag \n";

#recalculate
$newtnum = ($pnum + $bnum);
print "$newtnum is the new total \n";
if (($newtnum == $fnumber) && ($pbflag = 1)){
  &qualcheck;
  }
else {
  &checkbodyblanks;
  }


} # end sub checkpref                                


sub checkbodyblanks {

#Checks the body if there is a trailing blank page
#especially after an odd number of pages

#Find the suspect body file

#Note array starts with zero, so bad file will = $tnum not 
#$tnum +1 in the array
$findit = $tnum;
$badbody = @files[$findit];


print "$badbody is a suspect file. \n";

#Instantiates an image object with ImageMagick.        

$body = new Image::Magick();
$rb = $body->Read("$badbody");
die "$rb" if $rb;

print "Checking image $badbody for blank status...\n";

# This command shaves 100 pixels off the top/bottom  and 
# 500 off the sides so that it evaluates the middle
# of the image and not the edges to get an accurate brightness level.
$rb = $body->Shave(geometry=>'500x100');
$rb = $body->Set(page=>'0x0+0+0');

#Gathers statistics about the current file
@bstats = $body->Statistics();

$bmean = $bstats[3];
$diff = ($blmean - $bmean);  
print "The difference between $badbody and the reference file is $diff \n";

undef $rb;
undef $body;

if ($diff < 0.03) {
  print "The file $badbody is blank and is OK. \n";
  push @okblanks, $badbody;
  $bbflag = 1;
  }
  
  else {
  print "Please check the body file $badbody. \n";
  $bbflag = 0;
  print ERR "$doc - mismatch pages - $badbody is not blank \n";
  print ERR3 "$badbody,$pnum \n";
  }
                                  
#print "The bbflag is now $bbflag \n";

#Recalculate
$newtnum = ($pnum + $bnum);
if (($newtnum == $fnumber)&& ($bbflag = 1)){
  &qualcheck;
  }
else {
  &troubleshoot;
  }  


} # end sub checkbody


sub nopref {

# This runs if no statement of prefatory material is found
# on the 300 line of the MARC record

# The assumption is that the great majority of congressional document files have 4 pages of
#prefatory material before the body

$pnum = 4;

} # end sub nopref


sub nobody {

#This runs if no main body pages have been defined on the
#300 line of the MARC record, either because there were none cataloged,
#or if the 300 field was coded improperly,
#or if this is a multi-page document

print "Please check the MARC record $doc.cat . No pages were found in the 300 field. \n";
print ERR "$doc - bad MARC record $doc.cat \n";
print ERR3 "$doc \n";
print "Document will be moved to the PROBLEM directory.\n";
&moveproblem;

} #end sub nobody



sub troubleshoot {

# What we intend is to account for extra blank pages that may be present at the end of the document

print "reminder that bflag = $bflag\n";
print "reminder that fnumber = $fnumber\n";
print "reminder that newtnum = $newtnum\n";

#Odd page adjustment
if ($bflag == 1) {
	$bdiff = ($fnumber - $newtnum);
	#$bdiff = ($bdiff -1);
}

#Even pages need no adjustment
elsif ($bflag == 0) {
$bdiff = ($fnumber - $newtnum);

}
print "blanks diff = $bdiff \n";

$tflag = 0;

#Checking after the last text image file
$extrablank = ($newtnum +1);
$extrablank = sprintf("%04d", $extrablank);


until ($tflag == $bdiff) {
$tflag = $tflag +1;

#Instantiates an image object with ImageMagick.        

$extra = new Image::Magick();
$eb = $extra->Read("$doc-$extrablank.tif");
die "$eb" if $eb;

print "Checking image $extrablank for blank status...\n";

# This command shaves 100 pixels off the top/bottom  and 
# 500 off the sides so that it evaluates the middle
# of the image and not the edges to get an accurate brightness level.
$eb = $extra->Shave(geometry=>'500x100');
$eb = $extra->Set(page=>'0x0+0+0');

#Gathers statistics about the current file
@estats = $extra->Statistics();

$emean = $estats[3];
$diff = ($blmean - $emean);
print "The difference between $extrablank and the reference file is $diff \n";

#Recapturing memory
@$extra = ();

if ($diff < 0.03) {
  print "The file $extrablank is blank and is OK. \n";
  push @okblanks, $extrablank;
  $extrablank = ($extrablank +1);
  $extrablank = sprintf("%04d", $extrablank);
  #account for total pages
  $newtnum = ($newtnum + 1);
  print "newtnum recalcuated to $newtnum\n";
  
}

else {
        print "Please check the body file $extrablank. \n";        
        print ERR "$doc - mismatch pages - $extrablank is not blank \n";
        print ERR3 "$extrablank \n";
	  &nogood;
	  
        }

print "tflag is now $tflag \n";

	} # end until loop


if ($tflag == $bdiff) {
  $bbflag = 1;
    }
       
    else {
       $bbflag = 0;
       }
        
 
if (($newtnum == $fnumber)&& ($bbflag = 1)){
          &qualcheck;
            }
else {
        &nogood;
            }  
         
                     
#Undefine variables
                     
undef $eb;
undef $extra;

} 	#end sub troubleshoot


sub nogood {

print "Please check the document $doc manually.\n";
print "The total number of pages $fnumber does not match the MARC record statement of $tnum pages. \n";
print ERR "$doc - $fnumber files: Preface: $pnum, Body: $bnum, Total: $tnum \n"; 
print ERR3 "$doc \n";
&moveproblem;

}       #end sub nogood


sub congrats2 {

print "Congratulations. This document has passed its quality control check. \n";
print ERR4 "$doc\n";
&movegood;
print "OK BLANKS Array = @okblanks\n";

} #end congrats2


sub moveimagecheck {

#Moves checked files to IMAGECHECK folder for qualcheck
chdir "..";
system "mv $doc ../IMAGECHECK";
next MAIN;
}

sub movegood {
#Moves checked files to GOOD folder for processing
chdir "..";
system "mv $doc ../GOOD";
next MAIN;
}

sub moveproblem {

#Moves checked files to PROBLEM folder
chdir "..";
system "mv $doc ../PROBLEM";
next MAIN;
}

sub movesuspect {

#Moves checked files to SUSPECT folder
chdir "..";
system "mv $doc ../SUSPECT";
next MAIN;
}

sub readoutfile {

#reads $doc.out text file in $doc directory

chdir "$doc";

open (OUTFILE, "./$doc.out") or die "$doc.out cannot be opened";
while (<OUTFILE>) {
$infoline = $_;
chomp ($infoline);
print "$infoline \n";
#close OUTFILE;

@parseline = split(',', $infoline);
#doc is in @parseline[0] just in case
$pnum = @parseline[1];
print "preface = $pnum \n";
$firstpage = @parseline[2];
print "first page = $firstpage \n";
$finalpage = @parseline[3];
print "final page = $finalpage \n";

$bnum = ($finalpage - ($firstpage - 1));
print "body BNUM = $bnum \n";
$tnum = ($pnum + $bnum);
print "total TNUM = $tnum \n";

  }  #end while
    
    close OUTFILE;
    
} #end sub readoutfile
    

sub movemultiv {

chdir "$rdir"; 

system "mv $doc ../MULTIV";
#restart next doc
next MAIN;

}  # end sub movemultiv


sub readmarc {

# Reading the MARC record to determine the number of pages

#chdir "$doc";

open (CATFILE, "<$doc.cat") or die "File $doc.cat cannot be opened";

#Picking up roman numeral 
#prefatory pages and arabic numeral body pages from 300 line

while (<CATFILE>) {

 # $1 refers to (i|v|x|l|c|m)+)
 # $2 is not captured using ?: modifier
 # $3 refers to ([0-9]+)
  
     if (m/\|a.((i|v|x|l|c|m)+)(?:,.)([0-9]+)/)  {
       
         $preface = $1;
           $pages = $3;
             }
               
      } #end while
                
# Uses the Roman module to translate between Roman and Arabic numerals
# in order to perform calculations
                
$pnum = arabic($preface);
$bnum = $pages;
print "There are $pnum pages of prefatory material. \n";
print "There are $bnum pages in the body of the document. \n";
                
$tnum = ($pnum + $bnum);
                
print "The TOTAL number of pages should equal: $tnum \n";
close CATFILE;
                
                
      }  #end sub readmarc


sub qualcheck {

print "Beginning image quality control check for $doc now.\n";

#sets the QC flag
$qcflag = 0;

#Here we check all the images in the folder for brightness

chdir ("$rdir/$doc");

foreach $filey (@files) {

if (@found = grep(/\b$filey\b/,@okblanks))  {
	$output = join(",",@found);
	$output =~s/,*$//; # just in case of empty elements at the end of the array
	print "Included in list of known good blanks: $output\n";
	}

#Instantiates an image object with ImageMagick.        

$average = new Image::Magick();
$ra = $average->Read("$filey");
die "$rb" if $rb;

print "Checking image $filey for brightness . . .\n";

# This command shaves 10 pixels off the top/bottom  and 
# 500 off the sides so that it evaluates the middle
# of the image and not the edges to get an accurate brightness level.

$rb = $average->Shave(geometry=>'500x10');
$rb = $average->Set(page=>'0x0+0+0');

#Gathers statistics about the current file
@astats = $average->Statistics();

$amean = $astats[3];
$diff = ($smean - $amean);
  
  print " Mean of $filey is $amean \n";
  print "The difference between $filey and the reference file is $diff \n";

undef $rb;
undef $average;
  
#As defined by our testing – these values should be tested according to each
#institution’s needs.  
  #The mean of the clean standard file is: 0.870161 = -0.12973
  #The mean of the blank file is: 0.999974 
  #The mean of the bad 'drag' file is: 0.999493 = -0.129332 
  #Mean of "drag2" thicker smudge = 0.993365  = -0.123204
  #The mean of the bad 'black lines' file is: 0.625299   =.244862
  

if ($diff > 0.14) {
print "The file $filey appears to be too dark. Please check it. \n";
push @darkfiles, $filey;
print ERR2 "$filey,$pnum \n";
$qcflag++;
}

elsif (($diff > -0.125) && ($diff < 0.139999 )) {
print "$filey looks fine. \n";  
      }
                    
elsif (($diff < -0.124999) && ($filey eq $badpref)) {
print "$filey is OK; it's a blank even page in the preface. \n";
}

elsif (($diff < -0.124999) && (defined $output)) {
#print "$filey is OK; it's an extra blank page at the end of the document. \n";
print "$filey is OK; it's a known blank page. \n"
}

elsif ($diff < -0.124999) {print "$filey appears too light and may be blank. Please check it. \n";
push @lightfiles, $filey;
print ERR2 "$filey,$pnum \n";
$qcflag++;
}                              
                                        
} #end foreach files loop                                      
                                              
print ERR "$doc recap: \n";
print ERR "The darkfiles are: \n";
print ERR "\n  @darkfiles \n";
print ERR "The lightfiles are: \n";
print ERR "\n @lightfiles \n";

#Undefine variables
undef $badbody;
undef $badpref;
undef $output;
undef @okblanks;

#if there were no errors, everything is good to proceed
if ($qcflag == 0) {
&congrats2;
}

elsif ($qcflag > 0) {
&badnews;
}

}  #end sub qualcheck


sub badnews {

print "Document has failed Quality Control Check.\n";
print "moving $doc to SUSPECT directory for further checking.\n";
&movesuspect;
print "OK BLANKS Array = @okblanks\n";
undef (@okblanks);

}
                                                        
