#!/usr/bin/perl
#
# gdocpagecheckonly.pl
#
# Doreva Belfiore
# Rutgers University Law Library
# Camden, NJ

# Revision date: 9/13/2011
#

# This script performs quality control automatically on a series of folders of 
# scanned government documents.
# First we get the LC number from a folder of scanned government documents and
# download the matching MARC record from the Library catalog.
# In doing so, we compare the number of pages stated in the catalog record
# (MARC 300) against the number of pages actually scanned.
# We then send the folders on to be checked with a secondary image checking
# script on a client workstation.


#1 Load required third-party modules
# Roman.pm converts between Roman and Arabic numerals
use Roman;

#2 Set defined variables here

#opens the logfiles

# Append to the QC log. Every good/bad verdict below is written to ERR, so
# stop immediately if the log cannot be opened rather than losing results.
# Three-argument open keeps the mode separate from the file name.
open(ERR, '>>', 'gdocpagecheckonly.txt')
    or die "Cannot open gdocpagecheckonly.txt for appending: $!\n";


#4 Here we are going to count the total number of pages found in the directory first.

#$rdir = "../staging";
#$rdir = "../gdoc";

#changed destination directory so that processing can occur while students are still working
$rdir = "../gdoc/GDOCCHECK";

# Collect every entry of the staging directory whose name contains a digit;
# these names are used as the document folders to check, in sorted order.
# Fail loudly if the directory cannot be read: otherwise @lccns is empty and
# the script finishes without checking anything or saying why.
opendir(DIR, $rdir) or die "Cannot open staging directory $rdir: $!\n";
@lccns = sort grep { /\d+/ } readdir DIR;
closedir DIR;

print  "GOT LCCNS @lccns \n";
                          
#5 MARC record retrieval
#get MARC records for all documents first
# this way WGET opens and is closed

GM: foreach $doc (@lccns) {

    # Test for existence of a previously downloaded MARC record
    # (<doc>.cat inside the document's folder); nothing to fetch if present.
    if (-e "./$rdir/$doc/$doc.cat") {
        print "MARC record already downloaded. \n";
        next GM;
    }

    # No .cat file yet for this document: ask the site-specific fetcher.
    #print "Can't find a MARC .cat file for $doc \n";
    getmarc2();

}  #end GM
            

#6 Gathering pages (grep)

MAIN: foreach $doc (@lccns){

    # The page counts are package globals shared with the subroutines below.
    # Clear them for every document so that a record which cannot be parsed
    # reaches nopref/nobody instead of silently reusing the numbers left over
    # from the previous document.
    ($pnum, $bnum, $tnum, $preface, $pages) = ();

    # Work from inside the staging directory.
    # NOTE(review): $rdir is a relative path, so after the first successful
    # chdir this call presumably fails and leaves the cwd unchanged - confirm
    # before changing it.
    chdir "$rdir";

    print "\nInitiating quality control check for $doc now . . .\n";
    print "\nCounting total number of pages for  $doc . . . \n";
    $lccn = $doc;

    # Count the scanned page images (file names containing ".tif").
    @files = ();
    if (opendir(DIR2, "./$doc")) {
        @files = sort grep { /\.tif/ } readdir DIR2;
        closedir DIR2;
    }
    else {
        print "Cannot read directory $doc: $! \n";
    }
    $fnumber = @files;

    #test
    print "Found $fnumber files in the directory $doc. \n";


    #7  Test for multivolume document set, which requires special pagination.
    # A name with digits followed by one or two lowercase letters is treated
    # as a multivolume.

    if ($doc =~ m/([0-9]{1,8})([a-z]{1,2})/) {
        print "Found a multivolume! \n";
        $mvflag = 1;
    }
    else {
        print "Not a multivolume. Read the regular MARC file. \n";
        readmarc();
        $mvflag = 0;
    }

    # The cwd is the staging directory here, so the outfile is looked up with
    # the same "./$doc/..." path that readoutfile opens.
    if (($mvflag == 1) && (-e "./$doc/$doc.out")) {
        print "Found outfile $doc.out  Reading it instead of MARC record. \n";
        readoutfile();
    }
    elsif ($mvflag == 1) {
        print "Moving $doc to MULTIV directory for manual pagination (CGI) \n";
        movemultiv();    # does not return: jumps to the next document
    }


    #8 Evaluate the number of pages stated in MARC record against the number
    # of pages found in the folder.

    print "Matching the number of pages found in the folder against the MARC record for $doc . . .\n";

    # first we check that the preface and body pages have been identified

    if (defined ($pnum)) {
        print "$pnum \n";
    }
    else {
        # NOTE(review): nopref sets $pnum but $tnum is not recomputed, so the
        # comparison below still uses the total without the assumed preface
        # pages - confirm whether that is intended.
        nopref();
    }

    if (defined ($bnum)) {
        print "$bnum \n";
    }
    else {
        nobody();        # does not return: document goes to PROBLEM
    }

    #9 Test for match of page numbers.
    # Equal totals pass immediately; anything else goes through the even/odd
    # and blank-page allowances. (The original third "Failure" branch could
    # never run because == and != cover every case.)

    if ($tnum == $fnumber) {
        congrats1();
    }
    else {
        checkprefevenodd();
    }


    #10 The shared counters are cleared at the top of the loop.

    print "Check of $doc is complete. \n";
    #close ERR;


}	#end MAIN


########SUBROUTINES###########

sub checkprefevenodd {

    # Even/odd test on the prefatory page count, using the modulus operator.
    # An odd count is bumped up by one and $pflag records that it was
    # adjusted (1) or left alone (0). Works on the shared globals $pnum,
    # $podd_num and $pflag.
    $podd_num = $pnum % 2;

    if (!$podd_num) {
        print "Prefatory page $pnum is even. \n";
        $pflag = 0;
    }
    else {
        print "Prefatory page $pnum is odd. \n";
        $pnum += 1;
        $pflag = 1;
    }

    #print "The pflag is now  $pflag \n";

    # Next link in the chain: the same test on the body pages.
    checkbodyevenodd();

}  #end sub checkprefevenodd


sub checkbodyevenodd {

    # Even/odd test on the body page count. An odd count is bumped up by one
    # and $bflag records that it was adjusted (1) or left alone (0). Works on
    # the shared globals $bnum, $bodd_num and $bflag.
    $bodd_num = $bnum % 2;

    if (!$bodd_num) {
        print "Body page $bnum is even. \n";
        $bflag = 0;
    }
    else {
        print "Body page $bnum is odd. \n";
        $bnum += 1;
        $bflag = 1;
    }

    #print "The bflag is now $bflag \n";

    # Next link in the chain: re-total with the adjusted counts.
    checkprefblanks();


}  #end sub checkbodyevenodd


sub checkprefblanks {

    # Re-totals the pages after the even/odd adjustments made by
    # checkprefevenodd / checkbodyevenodd and accepts the document when the
    # adjusted total matches the file count and only the prefatory count was
    # rounded up. Otherwise hands over to checkbodyblanks.
    #
    # Reads globals: $pnum, $bnum, $fnumber, $pflag, $bflag.
    # Writes global: $newtnum.
    #
    # The flags are compared with ==. The earlier form assigned to them
    # ("= 1" / "= 0"), which forced the test to be false on every call and
    # overwrote $bflag.
    # NOTE(review): the earlier form named $pbflag, which is not set anywhere
    # in this file; $pflag (set by checkprefevenodd) is presumed to be the
    # variable that was meant - confirm.

    $newtnum = ($pnum + $bnum);
    print "$newtnum is the new total \n";

    if (($newtnum == $fnumber) && ($pflag == 1) && ($bflag == 0)) {
        congrats1();
    }
    else {
        checkbodyblanks();
    }

} #end sub checkprefblanks


sub checkbodyblanks {

    # Allows for one more page in the body count and re-totals. If the new
    # total matches the file count the document passes; otherwise markbad
    # decides where it goes.
    #
    # Reads globals: $pnum, $bnum, $fnumber.
    # Writes globals: $bnum (incremented), $bflag (set to 1), $newtnum.

    $bnum = ($bnum + 1);
    $bflag = 1;

    #recalculate
    $newtnum = ($pnum + $bnum);

    # The flag is compared, not assigned. The earlier form assigned 1 to
    # $bbflag inside the condition, which is always true.
    # NOTE(review): $bbflag appears nowhere else in this file; $bflag (set
    # just above) is presumed to be the flag that was meant - confirm.
    if (($newtnum == $fnumber) && ($bflag == 1)) {
        congrats1();
    }
    else {
        markbad();
    }

} #end checkbodyblanks


sub nopref {

    # Fallback for a MARC record whose 300 line gives no statement of
    # prefatory material.
    #
    # Working assumption: the great majority of congressional document files
    # carry 4 pages of prefatory material ahead of the body pages, so that
    # figure is stored in the shared $pnum.

    my $assumed_preface_pages = 4;
    $pnum = $assumed_preface_pages;

} # end sub nopref


sub nobody {

    # Called when no main body page count is available from the 300 line of
    # the MARC record: none was cataloged, the 300 field was coded
    # improperly, or this is a multi-page document.
    #
    # Tells the operator, logs the document as bad, and sends the folder to
    # the PROBLEM directory via moveproblem.

    print "Please check the MARC record $doc.cat . No pages were found in the 300 field. \n";
    print {*ERR} "$doc, bad\n";
    print "Document will be moved to the PROBLEM directory.\n";

    moveproblem();
}


sub congrats1 {

    # Page count confirmed correct: show the success message to the end
    # user, log the document as good, and pass the folder on via
    # moveimagecheck.

    print "\nSuccess! You have the correct number of files for $doc.\n";
    print {*ERR} "$doc, good\n";

    moveimagecheck();

} # end sub congrats1


sub moveimagecheck {

    # Moves the current document folder ($doc) to ../IMAGECHECK for
    # subsequent checking, then jumps to the next pass of the MAIN loop
    # (this sub does not return to its caller).

    chdir "../$rdir";

    # List-form system bypasses the shell, so a folder name containing
    # spaces or shell metacharacters reaches mv as one argument; "--" stops
    # a name starting with "-" from being read as an option.
    if (system('mv', '--', $doc, '../IMAGECHECK') != 0) {
        warn "Could not move $doc to ../IMAGECHECK (mv status $?)\n";
    }
    next MAIN;
}


sub moveblankcheck {

    # Moves the current document folder ($doc) to ../BLANKCHECK for
    # subsequent checking, then jumps to the next pass of the MAIN loop
    # (this sub does not return to its caller).

    chdir "../$rdir";

    # List-form system bypasses the shell, so a folder name containing
    # spaces or shell metacharacters reaches mv as one argument; "--" stops
    # a name starting with "-" from being read as an option.
    if (system('mv', '--', $doc, '../BLANKCHECK') != 0) {
        warn "Could not move $doc to ../BLANKCHECK (mv status $?)\n";
    }
    next MAIN;
}

sub moveproblem {

    # Moves the current document folder ($doc) to ../PROBLEM, then jumps to
    # the next pass of the MAIN loop (this sub does not return to its
    # caller).

    chdir "../$rdir";

    # List-form system bypasses the shell, so a folder name containing
    # spaces or shell metacharacters reaches mv as one argument; "--" stops
    # a name starting with "-" from being read as an option.
    if (system('mv', '--', $doc, '../PROBLEM') != 0) {
        warn "Could not move $doc to ../PROBLEM (mv status $?)\n";
    }
    next MAIN;
}

sub movesuspect {

    # Moves the current document folder ($doc) to ../SUSPECT, then jumps to
    # the next pass of the MAIN loop (this sub does not return to its
    # caller).

    chdir "../$rdir";

    # List-form system bypasses the shell, so a folder name containing
    # spaces or shell metacharacters reaches mv as one argument; "--" stops
    # a name starting with "-" from being read as an option.
    if (system('mv', '--', $doc, '../SUSPECT') != 0) {
        warn "Could not move $doc to ../SUSPECT (mv status $?)\n";
    }
    next MAIN;
}

 
sub getmarc2 {
 
# Site-specific hook, intentionally left empty here.
#
# This subroutine should be customized per library to fetch a valid MARC
# record for the particular government document LCCN or document number from
# the institution's catalog, using WGET or some other mechanism.
#
# The rest of the scripts expect to read a text file called <lccn>.cat.
# The caller tests for it at ./$rdir/$doc/$doc.cat and calls this sub, with
# the current document name in the global $doc, when that file is missing.

} #end sub getmarc2               


sub readoutfile {

    # Reads the first line of the text file ./$doc/$doc.out and derives the
    # page counts from it.
    #
    # Line format (comma separated): doc, preface pages, first page, final page
    #
    # Reads global:   $doc
    # Writes globals: $pnum, $bnum, $tnum, plus $infoline, @parseline,
    #                 $firstpage and $finalpage
    #
    # When the file is missing, empty, or lacks the page fields, $bnum is
    # left undefined so the caller's "no body pages" check deals with the
    # document instead of reusing numbers from an earlier one.

    my $outpath = "./$doc/$doc.out";

    # Start clean: these are shared globals and may hold another document's
    # values.
    ($pnum, $bnum, $tnum) = ();

    open(my $out_fh, '<', $outpath) or do {
        print "Cannot open $outpath: $! \n";
        return;
    };

    # Only the first line is used; the handle is closed once, after the read.
    $infoline = <$out_fh>;
    close $out_fh;

    if (!defined $infoline) {
        print "$outpath is empty. \n";
        return;
    }

    chomp ($infoline);
    print "$infoline \n";

    @parseline = split(',', $infoline);
    #doc is in $parseline[0] just in case
    $pnum = $parseline[1];
    print "preface = $pnum \n";
    $firstpage = $parseline[2];
    print "first page = $firstpage \n";
    $finalpage = $parseline[3];
    print "final page = $finalpage \n";

    # Without both page bounds a body count cannot be derived.
    if (!defined $firstpage || !defined $finalpage) {
        print "$outpath does not list a first and final page. \n";
        return;
    }

    # Inclusive page range: final - first + 1.
    $bnum = ($finalpage - ($firstpage - 1));
    print "body BNUM = $bnum \n";
    $tnum = ($pnum + $bnum);
    print "total TNUM = $tnum \n";

    return;

} #end sub readoutfile
    

sub movemultiv {

    # Moves a multivolume set ($doc) to the staging directory ../MULTIV, to
    # be processed by the CGI script for pagination, then restarts with the
    # next document in the MAIN loop (this sub does not return to its
    # caller).
    #
    # NOTE(review): the other move* subs use chdir "../$rdir" while this one
    # uses chdir "$rdir"; the result is unchecked either way - confirm which
    # form is intended.

    chdir "$rdir";

    # List-form system bypasses the shell, so a folder name containing
    # spaces or shell metacharacters reaches mv as one argument; "--" stops
    # a name starting with "-" from being read as an option.
    if (system('mv', '--', $doc, '../MULTIV') != 0) {
        warn "Could not move $doc to ../MULTIV (mv status $?)\n";
    }

    #restart next doc
    next MAIN;

}  # end sub movemultiv


sub readmarc {

    # Reads the MARC record to determine the number of pages.
    # Script expects a text file named <lccn>.cat at ./$doc/$doc.cat.
    # Script can be adjusted according to your specific site needs.
    #
    # Reads global:   $doc
    # Writes globals: $preface, $pages (raw captures), $pnum, $bnum, $tnum
    #
    # When the file cannot be opened or no line matches, $pnum and $bnum are
    # left undefined so the caller's nopref/nobody checks take over.

    my $catpath = "./$doc/$doc.cat";

    # These are shared globals; forget whatever an earlier document matched
    # so a record with no usable 300 line cannot inherit its page counts.
    ($preface, $pages) = ();

    # Looking to pick up roman numeral prefatory pages and arabic numeral
    # body pages from the 300 line.
    if (open(my $cat_fh, '<', $catpath)) {

        while (my $line = <$cat_fh>) {

            # $1 is the whole roman numeral, ((i|v|x|l|c|m)+)
            # $2 is the inner alternation group (unused here)
            # (?:,.) is non-capturing, so the digits ([0-9]+) land in $3
            # If several lines match, the last one wins.
            if ($line =~ m/\|a.((i|v|x|l|c|m)+)(?:,.)([0-9]+)/) {
                $preface = $1;
                $pages   = $3;
            }

        } #end while

        close $cat_fh;
    }
    else {
        print "Cannot open MARC file $catpath: $! \n";
    }

    # Uses the Roman third-party module to translate between Roman and
    # Arabic numerals in order to perform calculations.
    $pnum = defined $preface ? arabic($preface) : undef;
    $bnum = $pages;
    print "There are $pnum pages of prefatory material. \n";
    print "There are $bnum pages in the body of the document. \n";

    $tnum = ($pnum + $bnum);

    print "The TOTAL number of pages should equal: $tnum \n";

}  #end sub readmarc


sub markbad {

    # Final evaluation once none of the page-count allowances matched.
    # Compares the number of files found with the total from the record
    # ($checkdiff = $fnumber - $tnum), logs a verdict to ERR and sends the
    # folder on:
    #   exactly 1 extra file   -> logged good,    moveimagecheck
    #   2 or more extra files  -> logged blanks,  moveblankcheck
    #   fewer files than pages -> logged missing, moveproblem
    # Each move* sub jumps to the next document, so at most one branch runs.

    $checkdiff = ($fnumber - $tnum);

    if ($checkdiff == 1) {
        print ERR "$doc, good\n";
        moveimagecheck();
    }
    elsif ($checkdiff >= 2) {
        print ERR "$doc, $checkdiff blanks\n";
        moveblankcheck();
    }
    elsif ($fnumber < $tnum) {
        print ERR "$doc, $checkdiff missing\n";
        moveproblem();
    }

} #end sub markbad      
