#!/usr/bin/perl
#
# gdocpagecheckonly.pl
#
# Doreva Belfiore
# Rutgers University Law Library
# Camden, NJ
# Revision date: 9/13/2011
#
# This script performs quality control automatically on a series of folders of
# scanned government documents.
# First we get the LC # from a folder of scanned government documents and
# download the matching MARC record from the Library catalog.
# In doing so, we want to compare the number of pages stated in the catalog
# record (MARC 300) against the number of pages actually scanned.
# We then send the folders on to be checked with a secondary image checking
# script on a client workstation.
#
# Every path below is built from $rdir and is relative to the directory the
# script is started from; the script never changes its working directory.
# Each document folder ends up in a sibling directory of $rdir:
#   IMAGECHECK - page count confirmed (or off by exactly one page)
#   BLANKCHECK - two or more files more than the catalog record states
#   PROBLEM    - files missing, or no page count found in the record
#   MULTIV     - multivolume set that still needs manual pagination
# One "<document>, <verdict>" line per checked document is appended to the
# log file.

use strict;
use warnings;

#1 Load required third-party modules
# Roman.pm converts between Roman and Arabic numerals
use Roman;

#2 Set defined variables here

# Name of the log file; results are appended to it.
my $logfile = "gdocpagecheckonly.txt";

# Directory holding one folder per scanned document.
#my $rdir = "../staging";
#my $rdir = "../gdoc";
# changed destination directory so that processing can occur while students
# are still working
my $rdir = "../gdoc/GDOCCHECK";

# opens the logfile
open( my $ERR, '>>', $logfile )
    or die "Cannot open log file $logfile: $!\n";

#4 Collect the document folders found in the working directory.
opendir( my $rdir_handle, $rdir )
    or die "Cannot open directory $rdir: $!\n";
my @lccns = grep { /\d+/ } readdir $rdir_handle;
closedir $rdir_handle;
@lccns = sort @lccns;

print "GOT LCCNS @lccns \n";

#5 MARC record retrieval
# get MARC records for all documents first
# this way WGET opens and is closed
GM:
foreach my $doc (@lccns) {

    # test for existence of downloaded marc record
    if ( -e "$rdir/$doc/$doc.cat" ) {
        print "MARC record already downloaded. \n";
    }
    else {
        getmarc2($doc);
    }
}    #end GM

#6 Check every document in turn
MAIN:
foreach my $doc (@lccns) {
    check_document($doc);
    print "Check of $doc is complete. \n";
}    #end MAIN

# close the logfile; buffered write errors only surface here
close $ERR or warn "Problem closing log file $logfile: $!\n";

########SUBROUTINES###########

# check_document($doc)
# Runs the complete page-count check for one document folder and moves the
# folder to the directory matching the outcome. All page counts are lexical
# to this call, so nothing carries over from the previous document.
sub check_document {
    my ($doc) = @_;

    print "\nInitiating quality control check for $doc now . . .\n";
    print "\nCounting total number of pages for $doc . . . \n";

    my $fnumber = count_tif_files($doc);
    print "Found $fnumber files in the directory $doc. \n";

    #7 Test for multivolume document set, which requires special pagination
    my ( $pnum, $bnum );
    if ( $doc =~ m/([0-9]{1,8})([a-z]{1,2})/ ) {
        print "Found a multivolume! \n";
        if ( -e "$rdir/$doc/$doc.out" ) {
            print "Found outfile $doc.out Reading it instead of MARC record. \n";
            ( $pnum, $bnum ) = readoutfile($doc);
        }
        else {
            print "Moving $doc to MULTIV directory for manual pagination (CGI) \n";
            movemultiv($doc);
            return;
        }
    }
    else {
        print "Not a multivolume. Read the regular MARC file. \n";
        ( $pnum, $bnum ) = readmarc($doc);
    }

    #8 Evaluate the number of pages stated in MARC record against the number
    # of pages found in the folder.
    print "Matching the number of pages found in the folder against the MARC record for $doc . . .\n";

    # first we check that the preface and body pages have been identified
    my $used_default_preface = 0;
    if ( defined $pnum ) {
        print "$pnum \n";
    }
    else {
        $pnum                 = nopref();
        $used_default_preface = 1;
    }

    if ( defined $bnum ) {
        print "$bnum \n";
    }
    else {
        nobody($doc);
        return;
    }

    # The expected total is computed only now, so that a defaulted preface
    # count is part of it.
    my $tnum = $pnum + $bnum;
    if ($used_default_preface) {
        print "Assuming $pnum pages of prefatory material. "
            . "The TOTAL number of pages should equal: $tnum \n";
    }

    #9 Test for match of page numbers
    if ( $tnum == $fnumber ) {
        congrats1($doc);
        return;
    }

    # No exact match: allow for blank pages that pad odd page counts.
    my %check = (
        doc     => $doc,
        pnum    => $pnum,       # prefatory pages, adjusted by the checks
        bnum    => $bnum,       # body pages, adjusted by the checks
        tnum    => $tnum,       # total stated by the record, never adjusted
        fnumber => $fnumber,    # number of .tif files found
        pflag   => 0,           # 1 once the prefatory count was rounded up
        bflag   => 0,           # 1 once the body count was rounded up
    );
    checkprefevenodd( \%check );
    return;
}    #end sub check_document

# count_tif_files($doc)
# Returns the number of entries in the document folder whose name contains
# ".tif". A folder that cannot be read counts as 0 files.
sub count_tif_files {
    my ($doc) = @_;

    my $docdir = "$rdir/$doc";
    my $doc_handle;
    if ( !opendir( $doc_handle, $docdir ) ) {
        warn "Cannot open directory $docdir: $!\n";
        return 0;
    }
    my @files = grep { /\.tif/ } readdir $doc_handle;
    closedir $doc_handle;

    return scalar @files;
}    #end sub count_tif_files

# checkprefevenodd(\%check)
# First step of the blank-page checks. An odd prefatory page count is rounded
# up to the next even number, then the body pages are checked.
sub checkprefevenodd {
    my ($check) = @_;

    # checking even or odd numbers as this makes a difference
    # uses modulus operator %
    if ( $check->{pnum} % 2 ) {
        print "Prefatory page $check->{pnum} is odd. \n";
        $check->{pnum}++;
        $check->{pflag} = 1;
    }
    else {
        print "Prefatory page $check->{pnum} is even. \n";
        $check->{pflag} = 0;
    }

    #next check
    return checkbodyevenodd($check);
}    #end sub checkprefevenodd

# checkbodyevenodd(\%check)
# Rounds an odd body page count up to the next even number, then compares the
# adjusted total against the files found.
sub checkbodyevenodd {
    my ($check) = @_;

    if ( $check->{bnum} % 2 ) {
        print "Body page $check->{bnum} is odd. \n";
        $check->{bnum}++;
        $check->{bflag} = 1;
    }
    else {
        print "Body page $check->{bnum} is even. \n";
        $check->{bflag} = 0;
    }

    #next check
    return checkprefblanks($check);
}    #end sub checkbodyevenodd

# checkprefblanks(\%check)
# Accepts the document when the adjusted total matches the files found and
# only the prefatory count had to be rounded up. Otherwise the body pages get
# one more try in checkbodyblanks.
sub checkprefblanks {
    my ($check) = @_;

    my $newtnum = $check->{pnum} + $check->{bnum};
    print "$newtnum is the new total \n";

    # These must be comparisons (==); assignments here would make the test
    # constant and this branch unreachable.
    if (   $newtnum == $check->{fnumber}
        && $check->{pflag} == 1
        && $check->{bflag} == 0 )
    {
        return congrats1( $check->{doc} );
    }
    return checkbodyblanks($check);
}    #end sub checkprefblanks

# checkbodyblanks(\%check)
# Adds one more page to the body count and compares again; hands the document
# to markbad when the totals still differ.
sub checkbodyblanks {
    my ($check) = @_;

    $check->{bnum}++;
    $check->{bflag} = 1;

    #recalculate
    my $newtnum = $check->{pnum} + $check->{bnum};
    if ( $newtnum == $check->{fnumber} ) {
        return congrats1( $check->{doc} );
    }
    return markbad($check);
}    #end checkbodyblanks

# nopref()
# Returns the number of prefatory pages to assume when no statement of
# prefatory material is found on the 300 line of the MARC record.
# The assumption here is that the great majority of congressional document
# files have 4 pages of prefatory material before the body pages.
sub nopref {
    return 4;
}    # end sub nopref

# nobody($doc)
# This runs if no main body pages have been defined on the 300 line of the
# MARC record, either because there were none cataloged, or if the 300 field
# was coded improperly, or if this is a multi-page document.
# Logs the document as bad and moves it to the PROBLEM directory.
sub nobody {
    my ($doc) = @_;

    print "Please check the MARC record $doc.cat . No pages were found in the 300 field. \n";
    print {$ERR} "$doc, bad\n";
    print "Document will be moved to the PROBLEM directory.\n";
    return moveproblem($doc);
}    # end sub nobody

# congrats1($doc)
# What the end user sees to signal them that the number of files is confirmed
# correct. Logs the document as good and moves it on to IMAGECHECK.
sub congrats1 {
    my ($doc) = @_;

    print "\nSuccess! You have the correct number of files for $doc.\n";
    print {$ERR} "$doc, good\n";
    return moveimagecheck($doc);
}    # end sub congrats1

# move_document($doc, $destination)
# Moves the folder $doc from the working directory into the sibling directory
# $destination. mv is run without a shell, so folder names are never
# interpreted as shell syntax. Returns 1 on success, 0 (after a warning) when
# mv reports a failure.
sub move_document {
    my ( $doc, $destination ) = @_;

    my $status = system( 'mv', "$rdir/$doc", "$rdir/../$destination" );
    if ( $status != 0 ) {
        warn "Could not move $doc to $destination (mv status $status)\n";
        return 0;
    }
    return 1;
}    # end sub move_document

# Moves checked files to imagecheck folder for subsequent checking
sub moveimagecheck {
    my ($doc) = @_;
    return move_document( $doc, 'IMAGECHECK' );
}

# Moves checked files to blankcheck folder for subsequent checking
sub moveblankcheck {
    my ($doc) = @_;
    return move_document( $doc, 'BLANKCHECK' );
}

# Moves checked files to PROBLEM folder
sub moveproblem {
    my ($doc) = @_;
    return move_document( $doc, 'PROBLEM' );
}

# Moves checked files to SUSPECT folder
sub movesuspect {
    my ($doc) = @_;
    return move_document( $doc, 'SUSPECT' );
}

# Moves multivolume sets to the staging directory
# MULTIV to be processed by CGI script for pagination
sub movemultiv {
    my ($doc) = @_;
    return move_document( $doc, 'MULTIV' );
}    # end sub movemultiv

# getmarc2($doc)
# This subroutine should be customized per library to fetch a valid MARC
# record for the particular government document LCCN or document number from
# the institution's catalog using WGET or some other mechanism.
# The rest of the script reads the record from the text file
# "$rdir/$doc/$doc.cat".
sub getmarc2 {
    my ($doc) = @_;
    return;
}    #end sub getmarc2

# readoutfile($doc)
# Reads the first line of the $doc.out text file in the $doc directory. The
# line is comma separated: document, prefatory pages, first page, final page.
# Returns (prefatory page count, body page count), or an empty list when the
# file cannot be read or the first/final page fields are not whole numbers.
sub readoutfile {
    my ($doc) = @_;

    my $outfile = "$rdir/$doc/$doc.out";
    my $out_handle;
    if ( !open( $out_handle, '<', $outfile ) ) {
        warn "Cannot open outfile $outfile: $!\n";
        return;
    }
    my $infoline = <$out_handle>;    # only the first line is used
    close $out_handle;

    return if !defined $infoline;
    chomp $infoline;
    print "$infoline \n";

    #doc is in $parseline[0] just in case
    my @parseline = split /,/, $infoline;
    my ( $pnum, $firstpage, $finalpage ) = @parseline[ 1 .. 3 ];

    # Without numeric first and final pages the body count is meaningless.
    foreach my $page ( $firstpage, $finalpage ) {
        if ( !defined $page || $page !~ /\A\s*[0-9]+\s*\z/ ) {
            warn "Outfile $outfile has no usable page range: $infoline\n";
            return;
        }
    }

    # A blank preface field counts as 0 pages, as plain numeric use of the
    # field would.
    {
        no warnings 'numeric';
        $pnum = $pnum + 0;
    }

    print "preface = $pnum \n";
    print "first page = $firstpage \n";
    print "final page = $finalpage \n";

    my $bnum = $finalpage - ( $firstpage - 1 );
    print "body BNUM = $bnum \n";

    my $tnum = $pnum + $bnum;
    print "total TNUM = $tnum \n";

    return ( $pnum, $bnum );
}    #end sub readoutfile

# readmarc($doc)
# Reading the MARC record to determine the number of pages.
# Script expects a text file named $doc.cat in the $doc directory.
# Script can be adjusted according to your specific site needs.
# Returns (prefatory page count, body page count); either value is undef when
# it could not be determined from the record.
sub readmarc {
    my ($doc) = @_;

    my $catfile = "$rdir/$doc/$doc.cat";
    my ( $preface, $pages );

    # Looking to pick up roman numeral prefatory pages and arabic numeral
    # body pages from the 300 line.
    if ( open( my $cat_handle, '<', $catfile ) ) {
        while ( my $line = <$cat_handle> ) {

            # $1 is the run of lower-case roman numeral letters after "|a"
            # $2 is the arabic page count after the separating comma
            # No "last" here: when several lines match, the final one wins.
            if ( $line =~ m/\|a.([ivxlcm]+),.([0-9]+)/ ) {
                $preface = $1;
                $pages   = $2;
            }
        }    #end while
        close $cat_handle;
    }
    else {
        warn "Cannot open MARC record $catfile: $!\n";
    }

    # Uses the Roman third-party module to translate between Roman and Arabic
    # numerals in order to perform calculations
    my $pnum = defined $preface ? arabic($preface) : undef;
    my $bnum = $pages;

    # Values that were not found are shown as empty text and count as 0 in
    # the total reported here.
    my $shown_pnum = defined $pnum ? $pnum : '';
    my $shown_bnum = defined $bnum ? $bnum : '';
    print "There are $shown_pnum pages of prefatory material. \n";
    print "There are $shown_bnum pages in the body of the document. \n";

    my $tnum = ( defined $pnum ? $pnum : 0 ) + ( defined $bnum ? $bnum : 0 );
    print "The TOTAL number of pages should equal: $tnum \n";

    return ( $pnum, $bnum );
}    #end sub readmarc

# markbad(\%check)
# Evaluation of pages once the blank-page checks failed. Compares the files
# found with the total stated by the record:
#   exactly one extra file -> logged as good, moved to IMAGECHECK
#   two or more extra      -> logged as blanks, moved to BLANKCHECK
#   fewer files than pages -> logged as missing, moved to PROBLEM
# The difference is logged as computed, so it is negative for missing pages.
sub markbad {
    my ($check) = @_;

    my $doc       = $check->{doc};
    my $checkdiff = $check->{fnumber} - $check->{tnum};

    if ( $checkdiff == 1 ) {
        print {$ERR} "$doc, good\n";
        return moveimagecheck($doc);
    }
    elsif ( $checkdiff >= 2 ) {
        print {$ERR} "$doc, $checkdiff blanks\n";
        return moveblankcheck($doc);
    }
    elsif ( $check->{fnumber} < $check->{tnum} ) {
        print {$ERR} "$doc, $checkdiff missing\n";
        return moveproblem($doc);
    }
    return;
}    #end sub markbad