#! /usr/bin/perl -w # # # gdocblankcheck.pl # # Doreva Belfiore # Rutgers University Law Library # Camden, NJ # Revision date: 8/27/2011 # # # Latest revisions: use on Ubuntu Linux # N.B. – this script is written to run under Ubuntu Linux with an established sshfs share called ./camlaw . # Adjust this share name to your institution’s needs. # # This script performs quality control automatically on a # series of folders of scanned government documents. # First, it retrieves some reference information into memory about what a blank pages "looks # #like" to ImageMagick. Then we get the LC # from a folder of scanned government documents #and download the matching MARC record from the catalog # In doing so, we want to compare the number of pages stated in the catalog record (MARC #300) against the number of pages actually scanned. # By reading blanks, we can see if there are extra blanks at the end of the file and then signal to #a QC checker that there may be problems with a folder. # # Any documents with bad pages will be placed into the PROBLEM directory to be handled by #a staff member. # Documents which pass the initial page count testing will be placed into the IMAGECHECK #directory for further processing. #1 Load required third-party modules #ImageMagick and Roman should be pre-installed to your system use Image::Magick; use Roman; #2 Set defined variables here my($document, $blank); my($rg, $prefmat); my($rp, $fixer, $ff); my($rb, $body, $ra, $average); my @okblanks =(); #opens the logfiles open (ERR, ">>gdocblankcheck.txt"); open (ERR3, ">>gdocproblem.txt"); #3 Load the initial reference information # Takes a while to process at first, but only loads once per script run $gdir = ("./reference"); chdir ("$gdir"); print "Loading reference information now. Please wait... \n"; #Instantiates an image object in ImageMagick. $gdir = ("./reference"); chdir ("$gdir"); #3b Blank File $blank = new Image::Magick(); $rb = $blank->Read('blank.tif'); die "$rb" if $rb; #This command shaves 0 pixels off the top/bottom and # 500 pixels off the sides so that any black edges #of the image will not skew the measurement #to get an accurate brightness level. #Adjust this value to your institution’s needs. $rb = $blank->Shave(geometry=>'500x10'); $rb = $blank->Set(page=>'0x0+0+0'); #test write to view #$rb = $blank->Write(filename=>'blank_trim.tif', compression=>'None'); #Gathers statistics about the blank page file @bstats = $blank->Statistics(); $blmean = $bstats[3]; print "The mean of the blank file is: $blmean \n"; undef $rb; undef $blank; #4 Here we are going to count the total number of pages found in the directory first and account #for any blanks. chdir ".."; $rdir = ("../INSERT SERVER PATH HERE/BLANKCHECK"); #chdir ("$rdir"); opendir (DIR, "$rdir"); @lccns = grep /\d+/, readdir DIR; closedir DIR; @lccns = sort(@lccns); print "GOT LCCNS @lccns \n"; #5 Gathering pages (grep) MAIN: foreach $doc (@lccns){ chdir "$rdir"; print "\nInitiating quality control check for $doc now . . .\n"; print "\nCounting total number of pages for $doc . . . \n"; opendir DIR2, "$doc"; @files = grep /\.tif/, readdir DIR2; closedir DIR2; @files = sort(@files); $fnumber = @files; #test print "Found $fnumber files in the directory $doc. \n"; #6 Test for existence of multivolume document set chdir "$doc"; if ($doc =~ m/([0-9]{1,8})([a-z]{1,2})/) { print "Found a multivolume! \n"; $mvflag = 1; } else { print "Not a multivolume. Read the regular MARC file. \n"; &readmarc; $mvflag = 0; } if ($mvflag == 1) { print "Reading outfile instead of MARC record. \n"; &readoutfile; } #7 Evaluate number of pages stated in MARC record against the number of pages # found in the folder. print "Matching the number of pages found in the folder against the MARC record for $doc . . .\n"; # First we check that the preface and body pages have been identified CHECK1: if (defined ($pnum)) { print "$pnum \n"; } else { &nopref; } CHECK2: if (defined ($bnum)) { print "$bnum \n"; } else { &nobody; } #8 Test for match of page numbers CHECK3: if ($tnum == $fnumber) { &congrats1; } elsif ($tnum != $fnumber) { &checkprefevenodd; } else { print "Failure. Keep troubleshooting. \n"; } } #end MAIN #10 Undefine variables at the end as final precaution undef $document; undef $rg; undef $prefmat; undef $rp; undef $fixer; undef $ff; undef $rb; undef $body; undef $average; undef $fnumber; undef $tnum; undef $bnum; for (keys %okblankhash) { delete $okblankshash{$_}; } #close the error and logfiles close ERR; close ERR3; ########SUBROUTINES########### sub checkprefevenodd { #checking even or odd numbers as this makes a difference #uses modulus function % #Checks the prefatory material $podd_num = $pnum % 2; if ($podd_num) { print "Prefatory page $pnum is odd. \n"; $pnum = ($pnum + 1); $pflag = 1; } else { print "Prefatory page $pnum is even. \n"; $pflag = 0; } #print "The pflag is now $pflag \n"; #next check &checkbodyevenodd; } #end sub checkprefevenodd sub checkbodyevenodd { #Checks the body pages $bodd_num = $bnum % 2; if ($bodd_num) { print "Body page $bnum is odd. \n"; $bnum = ($bnum + 1); $bflag = 1; } else { print "Body page $bnum is even. \n"; $bflag = 0; } #print "The bflag is now $bflag \n"; #next check &checkprefblanks; } #end sub checkbodyevenodd sub checkprefblanks { print "Checking document $doc for any blank pages after the prefatory material \n"; print "or after the body pages. \n"; # Checks the prefatory material to see if there is a trailing blank page #after an odd number of pages #find the suspect preface file $findit = $pnum-1; $badpref = @files[$findit]; print "$badpref is a suspect file. \n"; #Instantiates an image object with ImageMagick. $prefmat = new Image::Magick(); $rp = $prefmat->Read("$badpref"); die "$rp" if $rp; print "Checking image $badpref for blank status . . .\n"; # This command shaves 100 pixels off the top/bottom and # 500 off the sides so that it evaluates the middle #of the image and not the edges to get an accurate brightness level. # Parameters can be adjusted as needed. $rp = $prefmat->Shave(geometry=>'500x100'); $rp = $prefmat->Set(page=>'0x0+0+0'); #Gathers statistics about the current file @avstats = $prefmat->Statistics(); $pmean = $avstats[3]; $diff = ($blmean - $pmean); print "The difference between $badpref and the reference file is $diff \n"; undef $rp; undef $prefmat; if ($diff < 0.03) { print "The file $badpref is blank and is OK. \n"; push @okblanks, $badpref; $pbflag = 1; } else { print "Please check the prefatory file $badpref. \n"; print ERR "$doc - prefatory file $badpref is not blank \n"; print ERR3 "$badpref,$pnum \n"; $pbflag = 0; } #print "The pbflag is now $pbflag \n"; #recalculate $newtnum = ($pnum + $bnum); print "$newtnum is the new total \n"; if (($newtnum == $fnumber) && ($pbflag = 1)){ &congrats1; } else { &checkbodyblanks; } } # end sub checkpref sub checkbodyblanks { #Checks the body if there is a trailing blank page, #especially after an odd number of pages. #Find the suspect body file #Note array starts with zero, so bad file will = $tnum not #$tnum +1 in the array $findit = $tnum; $badbody = @files[$findit]; print "$badbody is a suspect file. \n"; #Test for badbody - may be missing if (-e $badbody) { #Instantiates an image object with ImageMagick. $body = new Image::Magick(); $rb = $body->Read("$badbody"); die "$rb" if $rb; print "Checking image $badbody for blank status...\n"; # This command shaves 100 pixels off the top/bottom and # 500 off the sides so that it evaluates the middle # of the image and not the edges to get an accurate brightness level. $rb = $body->Shave(geometry=>'500x100'); $rb = $body->Set(page=>'0x0+0+0'); #Gathers statistics about the current file @bstats = $body->Statistics(); $bmean = $bstats[3]; $diff = ($blmean - $bmean); print "The difference between $badbody and the reference file is $diff \n"; undef $rb; undef $body; if ($diff < 0.03) { print "The file $badbody is blank and is OK. \n"; push @okblanks, $badbody; $bbflag = 1; } else { print "Please check the body file $badbody. \n"; $bbflag = 0; print ERR "$doc - mismatch pages - $badbody is not blank \n"; print ERR3 "$badbody,$pnum \n"; } #print "The bbflag is now $bbflag \n"; $newtnum = ($pnum + $bnum); if (($newtnum == $fnumber)&& ($bbflag = 1)){ &congrats1; } else { &troubleshoot; } } #endif test for $badbody else { print "$badbody not found - missing files\n"; &nogood; } } # end sub checkbody sub nopref { # This runs if no statement of prefatory material is found # on the 300 line of the MARC record #The assumption here now is that great majority of congressional document files have 4 pages of #prefatory material before the body. $pnum = 4; } # end sub nopref sub nobody { #This runs if no main body pages have been defined on the #300 line of the MARC record, either because there were none cataloged, #or if the 300 field was coded improperly, #or if this is a multi-page document. print "Please check the MARC record $doc.cat . No pages were found in the 300 field. \n"; print ERR "$doc - bad MARC record $doc.cat \n"; print ERR3 "$doc \n"; print "Document will be moved to the PROBLEM directory.\n"; &moveproblem; } #end sub nobody sub congrats1 { #What the end user sees to signal them that the number of files is confirmed correct print "\nSuccess! You have the correct number of files for $doc.\n"; print "Continuing on to quality control check. \n"; &moveimagecheck; } # end sub congrats1 sub troubleshoot { # What we intend to do here is account for extra blank pages that # may be present, and often are present, at the end of the congressional document. # Odd page adjustment if ($bflag == 1) { $bdiff = ($fnumber - $newtnum); #$bdiff = ($bdiff -1); } # Even pages need no adjustment elsif ($bflag == 0) { $bdiff = ($fnumber - $newtnum); } print "blanks diff = $bdiff \n"; $tflag = 0; #Checking after the last text image file $extrablank = ($newtnum +1); $extrablank = sprintf("%04d", $extrablank); until ($tflag == $bdiff) { $tflag = $tflag +1; #Instantiates an image object with ImageMagick. $extra = new Image::Magick(); $eb = $extra->Read("$doc-$extrablank.tif"); die "$eb" if $eb; print "Checking image $extrablank for blank status...\n"; # This command shaves 100 pixels off the top/bottom and # 500 off the sides so that it evaluates the middle # of the image and not the edges to get an accurate brightness level. $eb = $extra->Shave(geometry=>'500x100'); $eb = $extra->Set(page=>'0x0+0+0'); #Gathers statistics about the current file @estats = $extra->Statistics(); $emean = $estats[3]; $diff = ($blmean - $emean); print "The difference between $extrablank and the reference file is $diff \n"; #Recapturing memory @$extra = (); if ($diff < 0.03) { print "The file $extrablank is blank and is OK. \n"; $extrablank = ($extrablank +1); $extrablank = sprintf("%04d", $extrablank); #account for total pages $newtnum = ($newtnum + 1); print "newtnum recalcuated to $newtnum\n"; } else { print "Please check the body file $extrablank. \n"; print ERR "$doc - mismatch pages - $extrablank is not blank \n"; print ERR3 "$extrablank \n"; &nogood; } print "tflag is now $tflag \n"; } # end until loop if ($tflag == $bdiff) { $bbflag = 1; } else { $bbflag = 0; } if (($newtnum == $fnumber)&& ($bbflag = 1)){ &congrats1; } else { &nogood; } #undefine variables undef $eb; undef $extra; } #end sub troubleshoot sub nogood { print "Please check the document $doc manually.\n"; print "The total number of pages $fnumber does not match the MARC record statement of $tnum pages. \n"; print ERR "$doc - $fnumber files: Preface: $pnum, Body: $bnum, Total: $tnum \n"; print ERR3 "$doc \n"; &moveproblem; } #end sub nogood sub congrats2 { print "Congratulations. This document has passed its quality control check. \n"; &movegood; print "OK blanks array = @okblanks\n"; } #end congrats2 sub moveimagecheck { #moves checked files to IMAGECHECK folder for qualcheck chdir ".."; system "mv $doc ../IMAGECHECK"; next MAIN; } sub moveproblem { #moves checked files to PROBLEM folder chdir ".."; system "mv $doc ../PROBLEM"; next MAIN; } sub readoutfile { #reads $doc.out text file in $doc directory chdir "$doc"; open (OUTFILE, "./$doc.out") or die "$doc.out cannot be opened"; while () { $infoline = $_; chomp ($infoline); print "$infoline \n"; #close OUTFILE; @parseline = split(',', $infoline); #doc is in @parseline[0] just in case $pnum = @parseline[1]; print "preface = $pnum \n"; $firstpage = @parseline[2]; print "first page = $firstpage \n"; $finalpage = @parseline[3]; print "final page = $finalpage \n"; $bnum = ($finalpage - ($firstpage - 1)); print "body BNUM = $bnum \n"; $tnum = ($pnum + $bnum); print "total TNUM = $tnum \n"; } #end while close OUTFILE; } #end sub readoutfile sub movemultiv { chdir "$rdir"; #system "mv $doc ./MULTIV"; #restart next doc next MAIN; } # end sub movemultiv sub readmarc { # Reading the MARC record to determine the number of pages #chdir "$doc"; open (CATFILE, "<$doc.cat") or die "File $doc.cat cannot be opened"; #Testing GMD #Basically looking for something that can pick up roman numeral #prefatory pages and arabic numeral body pages from 300 line while () { # Regular expression retrieves preface & pages # $1 refers to (i|v|x|l|c|m)+) # $2 is not captured using ?: modifier # $3 refers to ([0-9]+) if (m/\|a.((i|v|x|l|c|m)+)(?:,.)([0-9]+)/) { $preface = $1; $pages = $3; } } #end while # Uses the Roman module to translate between Roman and Arabic numerals # in order to perform calculations $pnum = arabic($preface); $bnum = $pages; print "There are $pnum pages of prefatory material. \n"; print "There are $bnum pages in the body of the document. \n"; $tnum = ($pnum + $bnum); print "The TOTAL number of pages should equal: $tnum \n"; close CATFILE; } #end sub readmarc