#! /usr/bin/perl -w
#
#
# gdocblankcheck.pl
#
# Doreva Belfiore
# Rutgers University Law Library
# Camden, NJ

# Revision date: 8/27/2011
#
#
# Latest revisions: use on Ubuntu Linux
# N.B. – this script is written to run under Ubuntu Linux with an established sshfs share called ./camlaw . 
# Adjust this share name to your institution’s needs.
#
#  This script performs quality control automatically on a 
#  series of folders of scanned government documents.
#  First, it retrieves some reference information into memory about what a blank pages "looks # #like" to ImageMagick. Then we get the LC # from a folder of scanned government documents #and download the matching MARC record from the catalog
#  In doing so, we want to compare the number of pages stated in the catalog record (MARC #300) against the number of pages actually scanned.
#  By reading blanks, we can see if there are extra blanks at the end of the file and then signal to #a QC checker that there may be problems with a folder.
# 

#  Any documents with bad pages will be placed into the PROBLEM directory to be handled by #a staff member.
#  Documents which pass the initial page count testing will be placed into the IMAGECHECK #directory for further processing.


#1 Load required third-party modules
#ImageMagick and Roman should be pre-installed to your system
use Image::Magick;
use Roman;

#2 Set defined variables here

my($document, $blank);
my($rg, $prefmat);
my($rp, $fixer, $ff);
my($rb, $body, $ra, $average);
my @okblanks =();

#opens the logfiles

open (ERR, ">>gdocblankcheck.txt");
open (ERR3, ">>gdocproblem.txt");

#3 Load the initial reference information
# Takes a while to process at first, but only loads once per script run

$gdir = ("./reference");
chdir ("$gdir");

print "Loading reference information now. Please wait... \n";

#Instantiates an image object in ImageMagick.

$gdir = ("./reference");
chdir ("$gdir");


#3b Blank File
            
$blank = new Image::Magick();   
$rb = $blank->Read('blank.tif');
die "$rb" if $rb;
            
#This command shaves 0 pixels off the top/bottom and 
# 500  pixels off the sides so that any black edges
#of the image will not skew the measurement
#to get an accurate brightness level.
#Adjust this value to your institution’s needs.
            
$rb = $blank->Shave(geometry=>'500x10');
$rb = $blank->Set(page=>'0x0+0+0');
#test write to view
#$rb = $blank->Write(filename=>'blank_trim.tif', compression=>'None');
            
#Gathers statistics about the blank page file
            
@bstats = $blank->Statistics();
            
$blmean = $bstats[3];                  
print "The mean of the blank file is: $blmean \n";
                              
undef $rb;                             
undef $blank;


#4 Here we are going to count the total number of pages found in the directory first and account #for any blanks.

chdir "..";

$rdir = ("../INSERT SERVER PATH HERE/BLANKCHECK");
#chdir ("$rdir");

opendir (DIR, "$rdir");
@lccns = grep /\d+/, readdir DIR;
closedir DIR;
@lccns = sort(@lccns);
             
print  "GOT LCCNS @lccns \n";
                          

#5 Gathering pages (grep)

MAIN: foreach $doc (@lccns){

chdir "$rdir";             
print "\nInitiating quality control check for $doc now . . .\n";
print "\nCounting total number of pages for  $doc . . . \n";

opendir DIR2, "$doc";
@files = grep /\.tif/, readdir DIR2;
closedir DIR2;
@files = sort(@files);
$fnumber = @files;
                                
#test
print "Found $fnumber files in the directory $doc. \n";
                                                                        

#6  Test for existence of multivolume document set

chdir "$doc";

if ($doc =~ m/([0-9]{1,8})([a-z]{1,2})/) {
print "Found a multivolume! \n";
$mvflag = 1;
}

else {
print "Not a multivolume. Read the regular MARC file. \n";
&readmarc;  
$mvflag = 0;
}

if ($mvflag == 1) {
print "Reading outfile instead of MARC record. \n";
&readoutfile;
}


#7 Evaluate number of pages stated in MARC record against the number of pages
# found in the folder.

print "Matching the number of pages found in the folder against the MARC record for $doc . . .\n";

# First we check that the preface and body pages have been identified

CHECK1: 
if (defined ($pnum)) {
  print "$pnum \n";
}
  else {
  &nopref;
  }

CHECK2: 
if (defined ($bnum)) {
  print "$bnum \n";
  }
  else {
  &nobody;
  }

#8 Test for match of page numbers

CHECK3:

  if ($tnum == $fnumber) {
    &congrats1;
    }
  elsif ($tnum != $fnumber) {
    &checkprefevenodd;
    }
  else {
    print "Failure. Keep troubleshooting. \n";
    }
   

}	#end MAIN



#10  Undefine variables at the end as final precaution

undef $document;
undef $rg;
undef $prefmat;
undef $rp;
undef $fixer;
undef $ff;
undef $rb;
undef $body;
undef $average;  
undef $fnumber;
undef $tnum;
undef $bnum;

for (keys %okblankhash)
 {
 delete $okblankshash{$_};
 }

#close the error and logfiles 
close ERR;
close ERR3;

########SUBROUTINES###########

sub checkprefevenodd {

#checking even or odd numbers as this makes a difference
#uses modulus function %

#Checks the prefatory material
$podd_num = $pnum % 2;
if ($podd_num) {
   print "Prefatory page $pnum is odd. \n"; 
   $pnum = ($pnum + 1);
   $pflag = 1;
   }
else { 
  print "Prefatory page $pnum is even. \n"; 
  $pflag = 0;
  }

#print "The pflag is now  $pflag \n";

#next check
&checkbodyevenodd;

}  #end sub checkprefevenodd


sub checkbodyevenodd {

#Checks the body pages
$bodd_num = $bnum % 2; 
if ($bodd_num) { 
  print "Body page $bnum is odd. \n"; 
  $bnum = ($bnum + 1);
  $bflag = 1;
  }
else { 
  print "Body page $bnum is even. \n";
  $bflag = 0;
  }

#print "The bflag is now $bflag \n";

#next check
&checkprefblanks;

}  #end sub checkbodyevenodd



sub checkprefblanks {

print "Checking document $doc for any blank pages after the prefatory material \n";
print "or after the body pages. \n";

# Checks the prefatory material to see if there is a trailing blank page
#after an odd number of pages

#find the suspect preface file
$findit = $pnum-1;
$badpref = @files[$findit];

print "$badpref is a suspect file. \n";

#Instantiates an image object with ImageMagick.        

$prefmat = new Image::Magick();
$rp = $prefmat->Read("$badpref");
die "$rp" if $rp;

print "Checking image $badpref for blank status . . .\n";

# This command shaves 100 pixels off the top/bottom  and 
# 500 off the sides so that it evaluates the middle
#of the image and not the edges to get an accurate brightness level.
# Parameters can be adjusted as needed.
$rp = $prefmat->Shave(geometry=>'500x100');
$rp = $prefmat->Set(page=>'0x0+0+0');

#Gathers statistics about the current file
@avstats = $prefmat->Statistics();

$pmean = $avstats[3];
$diff = ($blmean - $pmean);  
print "The difference between $badpref and the reference file is $diff \n";

undef $rp;
undef $prefmat;


if ($diff < 0.03) {
  print "The file $badpref is blank and is OK. \n";
  push @okblanks, $badpref;
  $pbflag = 1;
  }
  
  else {
  print "Please check the prefatory file $badpref. \n";
  print ERR "$doc - prefatory file $badpref is not blank \n";
  print ERR3 "$badpref,$pnum \n";
  $pbflag = 0;
  }
                                  
#print "The pbflag is now $pbflag \n";

#recalculate
$newtnum = ($pnum + $bnum);
print "$newtnum is the new total \n";
if (($newtnum == $fnumber) && ($pbflag = 1)){
  &congrats1;
  }
else {
  &checkbodyblanks;
  }

} # end sub checkpref                                


sub checkbodyblanks {

#Checks the body if there is a trailing blank page,
#especially after an odd number of pages.

#Find the suspect body file

#Note array starts with zero, so bad file will = $tnum not 
#$tnum +1 in the array
$findit = $tnum;
$badbody = @files[$findit];

print "$badbody is a suspect file. \n";

#Test for badbody - may be missing

if (-e $badbody) {

#Instantiates an image object with ImageMagick.        

$body = new Image::Magick();
$rb = $body->Read("$badbody");
die "$rb" if $rb;

print "Checking image $badbody for blank status...\n";

# This command shaves 100 pixels off the top/bottom  and 
# 500 off the sides so that it evaluates the middle
# of the image and not the edges to get an accurate brightness level.
$rb = $body->Shave(geometry=>'500x100');
$rb = $body->Set(page=>'0x0+0+0');

#Gathers statistics about the current file
@bstats = $body->Statistics();

$bmean = $bstats[3];
$diff = ($blmean - $bmean);  
print "The difference between $badbody and the reference file is $diff \n";

undef $rb;
undef $body;

if ($diff < 0.03) {
  print "The file $badbody is blank and is OK. \n";
  push @okblanks, $badbody;
  $bbflag = 1;
  }
  
  else {
  print "Please check the body file $badbody. \n";
  $bbflag = 0;
  print ERR "$doc - mismatch pages - $badbody is not blank \n";
  print ERR3 "$badbody,$pnum \n";
  }
                                  
#print "The bbflag is now $bbflag \n";

$newtnum = ($pnum + $bnum);
if (($newtnum == $fnumber)&& ($bbflag = 1)){
  &congrats1;
  }
else {
  &troubleshoot;
  }  

} #endif test for $badbody

else {

print "$badbody not found - missing files\n";
&nogood;
}

} # end sub checkbody


sub nopref {

# This runs if no statement of prefatory material is found
# on the 300 line of the MARC record

#The assumption here now is that great majority of congressional document files have 4 pages of
#prefatory material before the body.

$pnum = 4;

} # end sub nopref


sub nobody {

#This runs if no main body pages have been defined on the
#300 line of the MARC record, either because there were none cataloged,
#or if the 300 field was coded improperly,
#or if this is a multi-page document.

print "Please check the MARC record $doc.cat . No pages were found in the 300 field. \n";
print ERR "$doc - bad MARC record $doc.cat \n";
print ERR3 "$doc \n";
print "Document will be moved to the PROBLEM directory.\n";
&moveproblem;

} #end sub nobody


sub congrats1 {

#What the end user sees to signal them that the number of files is confirmed correct

print "\nSuccess! You have the correct number of files for $doc.\n";
print "Continuing on to quality control check. \n";
&moveimagecheck;

} # end sub congrats1


sub troubleshoot {

# What we intend to do here is account for extra blank pages that
# may be present, and often are present, at the end of the congressional document.

# Odd page adjustment

if ($bflag == 1) {
	$bdiff = ($fnumber - $newtnum);
	#$bdiff = ($bdiff -1);
}

# Even pages need no adjustment
elsif ($bflag == 0) {
$bdiff = ($fnumber - $newtnum);

}
print "blanks diff = $bdiff \n";

$tflag = 0;

#Checking after the last text image file
$extrablank = ($newtnum +1);
$extrablank = sprintf("%04d", $extrablank);


until ($tflag == $bdiff) {
$tflag = $tflag +1;

#Instantiates an image object with ImageMagick.        

$extra = new Image::Magick();
$eb = $extra->Read("$doc-$extrablank.tif");
die "$eb" if $eb;

print "Checking image $extrablank for blank status...\n";

# This command shaves 100 pixels off the top/bottom  and 
# 500 off the sides so that it evaluates the middle
# of the image and not the edges to get an accurate brightness level.
$eb = $extra->Shave(geometry=>'500x100');
$eb = $extra->Set(page=>'0x0+0+0');

#Gathers statistics about the current file
@estats = $extra->Statistics();

$emean = $estats[3];
$diff = ($blmean - $emean);
print "The difference between $extrablank and the reference file is $diff \n";

#Recapturing memory
@$extra = ();

if ($diff < 0.03) {
  print "The file $extrablank is blank and is OK. \n";
  $extrablank = ($extrablank +1);
  $extrablank = sprintf("%04d", $extrablank);
  #account for total pages
  $newtnum = ($newtnum + 1);
  print "newtnum recalcuated to $newtnum\n";
}

else {
        print "Please check the body file $extrablank. \n";        
        print ERR "$doc - mismatch pages - $extrablank is not blank \n";
        print ERR3 "$extrablank \n";
	  &nogood;
	  
        }

print "tflag is now $tflag \n";

	} # end until loop


if ($tflag == $bdiff) {
  $bbflag = 1;
    }
       
    else {
       $bbflag = 0;
       }
                
if (($newtnum == $fnumber)&& ($bbflag = 1)){
          &congrats1;
            }
else {
        &nogood;
            }  
                     
#undefine variables
                     
undef $eb;
undef $extra;

} 	#end sub troubleshoot


sub nogood {

print "Please check the document $doc manually.\n";
print "The total number of pages $fnumber does not match the MARC record statement of $tnum pages. \n";
print ERR "$doc - $fnumber files: Preface: $pnum, Body: $bnum, Total: $tnum \n"; 
print ERR3 "$doc \n";
&moveproblem;

}       #end sub nogood


sub congrats2 {

print "Congratulations. This document has passed its quality control check. \n";
&movegood;
print "OK blanks array = @okblanks\n";
} #end congrats2


sub moveimagecheck {

#moves checked files to IMAGECHECK folder for qualcheck
chdir "..";
system "mv $doc ../IMAGECHECK";
next MAIN;
}


sub moveproblem {

#moves checked files to PROBLEM folder
chdir "..";
system "mv $doc ../PROBLEM";
next MAIN;
}


sub readoutfile {

#reads $doc.out text file in $doc directory

chdir "$doc";

open (OUTFILE, "./$doc.out") or die "$doc.out cannot be opened";
while (<OUTFILE>) {
$infoline = $_;
chomp ($infoline);
print "$infoline \n";
#close OUTFILE;

@parseline = split(',', $infoline);
#doc is in @parseline[0] just in case
$pnum = @parseline[1];
print "preface = $pnum \n";
$firstpage = @parseline[2];
print "first page = $firstpage \n";
$finalpage = @parseline[3];
print "final page = $finalpage \n";

$bnum = ($finalpage - ($firstpage - 1));
print "body BNUM = $bnum \n";
$tnum = ($pnum + $bnum);
print "total TNUM = $tnum \n";

  }  #end while
    
    close OUTFILE;
    
} #end sub readoutfile
    

sub movemultiv {

chdir "$rdir"; 

#system "mv $doc ./MULTIV";
#restart next doc
next MAIN;

}  # end sub movemultiv


sub readmarc {

# Reading the MARC record to determine the number of pages

#chdir "$doc";

open (CATFILE, "<$doc.cat") or die "File $doc.cat cannot be opened";

#Testing GMD
#Basically looking for something that can pick up roman numeral 
#prefatory pages and arabic numeral body pages from 300 line

while (<CATFILE>) {

 # Regular expression retrieves preface & pages
 # $1 refers to (i|v|x|l|c|m)+)
 # $2 is not captured using ?: modifier
 # $3 refers to ([0-9]+)
  
     if (m/\|a.((i|v|x|l|c|m)+)(?:,.)([0-9]+)/)  {
       
         $preface = $1;
           $pages = $3;
             }
               
      } #end while
                
# Uses the Roman module to translate between Roman and Arabic numerals
# in order to perform calculations
                
$pnum = arabic($preface);
$bnum = $pages;
print "There are $pnum pages of prefatory material. \n";
print "There are $bnum pages in the body of the document. \n";
                
$tnum = ($pnum + $bnum);
                
print "The TOTAL number of pages should equal: $tnum \n";
close CATFILE;
                
                
      }  #end sub readmarc
                                               
