#!/usr/bin/perl -w

use FindBin;
use lib "$FindBin::Bin/../perl_lib";

######################################################################
#
#
######################################################################

=pod

=for Pod2Wiki

=head1 NAME

B<check_xapian> - Checks that Xapian is successfully indexing eprint items.

=head1 SYNOPSIS

B<check_xapian> I<repository_id> [B<options>] 

=head1 DESCRIPTION

This script checks that eprint items have successfully been indexed by Xapian.  It does this by using "/usr/bin/quest" to search the Xapian database filesystem for the title of the particular item.  This requires the 'xapian-tools' package to be installed.  If the title does not contain any special characters or quote marks it will attempt to search for the title within quote marks.  If this is not the case or this returns no matching results, it will search for all the terms in the title in an attempt to find a result.  The script will print a success message to the standard output if a result with the correct EPrint ID is returned otherwise it will print a failure message to the standard error.  A summary message will be printed before the script exits totalling the number of items found not to be indexed.

=head1 ARGUMENTS

=over 8

=item B<repository_id> 

The ID of the eprint repository to use.

=back

=head1 OPTIONS

=over 8

=item B<--all> 

Check all eprint items rather than those last modified in the last 2 days.

=item B<--idlist=1,2,3> 

Check all items listed in the comma-separated list of IDs.

=item B<--facets> 

Check search results using the facets matching protocol rather than the standard Xapian matching protocol.

=item B<--days=n>

Check eprint items last modified in the last n days rather than the last 2 days.

=item B<--results=n>

Only check the top n results ordered by relevance.  By default only check first 1000 results.

=item B<--help>

Print a brief help message and exit.

=item B<--man>

Print the full manual page and then exit.

=item B<--path=/path/to/xapian/directory/> 

Specify the directory of the Xapian database if it is not EPRINTS_PATH/archives/ARCHIVEID/var/xapian/.

=item B<--quiet>

This option will suppress all output unless an error occurs.

=item B<--random>

Takes a random sample of 100 eprint items to check whether they are indexed.  Can be used with --sample=n to specify a different random sample size.  Can be used with --days to specify how recently the random sample should be taken from.

=item B<--reindex> 

If an item is not found then schedule it for reindexing.

=item B<--sample=n>

Used in conjunction with --random to specify the sample size of items to check. By default sample size is 100.

=item B<--status=archive,deletion>

Used to specify the status of eprint (inbox, buffer, archive, deletion) to be checked.  If not specified eprints of all statuses will be checked. Specify multiple statuses through a comma-separated list. Ignored if used in conjunction with --idlist.

=item B<--user=admin>

Used in conjunction with --reindex to specify the username of the user who should be set as the creator for indexing tasks.  If not set the first admin or local_admin user created will be used.

=item B<--verbose>

Explain in detail what is going on.

=item B<--version>

Output version information and exit.

=back   


=cut

use EPrints;

use 5.010001;
use Time::Piece;
use Data::Dumper;
use strict;
use Getopt::Long;
use Pod::Usage;

# Switches (and the --verbose counter) all start switched off.
my ( $version, $verbose, $quiet, $help, $man ) = ( 0, 0, 0, 0, 0 );
my ( $all, $reindex, $random, $facets ) = ( 0, 0, 0, 0 );

# Options that carry a value stay undefined unless given on the command line.
my ( $idlist_opt, $path_opt, $days_opt, $results_opt );
my ( $sample_opt, $status_opt, $user_opt );

Getopt::Long::Configure("permute");

# --silent is accepted as an alias of --quiet; --verbose may be repeated and
# each occurrence increments $verbose.
my $options_ok = GetOptions(
	'help|?'    => \$help,
	'man'       => \$man,
	'version'   => \$version,
	'verbose+'  => \$verbose,
	'silent'    => \$quiet,
	'quiet'     => \$quiet,
	'all'       => \$all,
	'idlist=s'  => \$idlist_opt,
	'random'    => \$random,
	'reindex'   => \$reindex,
	'facets'    => \$facets,
	'path=s'    => \$path_opt,
	'days=s'    => \$days_opt,
	'results=s' => \$results_opt,
	'sample=s'  => \$sample_opt,
	'status=s'  => \$status_opt,
	'user=s'    => \$user_opt,
);
pod2usage( 2 ) unless $options_ok;

if ( $version )
{
	EPrints::Utils::cmd_version( "check_xapian" );
}
if ( $help )
{
	pod2usage( 1 );
}
if ( $man )
{
	pod2usage( -exitstatus => 0, -verbose => 2 );
}

# Exactly one positional argument (the repository ID) must remain.
unless ( scalar @ARGV == 1 )
{
	pod2usage( 2 );
}

# Noise level handed to the session: 0 when quiet, 1 by default, and
# 1 + the number of --verbose flags when verbose (verbose overrides quiet).
my $noise = $quiet ? 0 : 1;
if ( $verbose )
{
	$noise = 1 + $verbose;
}

# The single positional argument is the repository ID.
my $repoid = $ARGV[0];
my $session = EPrints::Session->new( 1 , $repoid, $noise );
unless ( defined $session )
{
	print STDERR "Failed to load repository: $repoid\n";
	exit 1;
}

# Work out the Xapian database directory: an explicit --path wins, otherwise
# use the repository's var/xapian.facet/ (with --facets) or var/xapian/.
my $path;
if ( defined $path_opt )
{
	$path = $path_opt;
}
elsif ( $facets )
{
	$path = $session->config( "base_path" ).'/archives/'.$ARGV[0].'/var/xapian.facet/';
}
else
{
	$path = $session->config( "base_path" ).'/archives/'.$ARGV[0].'/var/xapian/';
}

# The directory only counts as a database if one of these marker files exists.
my $has_record_db = -e $path."/record.DB";
my $has_iamglass = -e $path."/iamglass";
if ( !$has_record_db && !$has_iamglass )
{
	print STDERR "\nError: No Xapian database directory at '$path'.\n\n";
	exit 1;
}

my $quest_location = `/usr/bin/which quest`;
$quest_location =~ s/\n//g;

# The delve tool is looked up under a different name depending on which marker
# file was found; "iamglass" takes precedence when both are present.
my $delve_location = "";
if ( $has_iamglass )
{
	$delve_location = `/usr/bin/which xapian-delve`;
}
elsif ( $has_record_db )
{
	$delve_location = `/usr/bin/which delve`;
}
$delve_location =~ s/\n//g;

if ( $quest_location eq "" )
{
	print STDERR "\nError: Failed to find 'quest' command line tool.\n\n";
	exit 1;
}

if ( $delve_location eq "" )
{
	print STDERR "\nError: Failed to find 'delve' command line tool.\n\n";
	exit 1;
}

# Text appended to the summary messages when only recent items were checked.
my $period = "";

# Returns the integer value of a numeric option, or prints an error and exits
# with status 1 if the whole value is not a positive integer.  The pattern is
# anchored at both ends so that values such as "abc1" or "1x" are rejected
# rather than being silently turned into 0 or 1 by int().  Leading zeros are
# tolerated (e.g. "02" gives 2).
#   $value  - the raw option value
#   $suffix - text inserted before the full stop of the error message
my $parse_positive_int = sub
{
	my ( $value, $suffix ) = @_;
	if ( $value !~ m/\A0*[1-9][0-9]*\z/ )
	{
		print STDERR "\nError: $value is not a positive integer$suffix.\n\n";
		exit 1;
	}
	return int( $value );
};

# Number of days back to look when checking recently modified items.
my $days = 2;
$days = $parse_positive_int->( $days_opt, " for --days" ) if defined $days_opt;

# Maximum number of search results requested from quest for each item.
my $numresults = 1000;
$numresults = $parse_positive_int->( $results_opt, " for --results" ) if defined $results_opt;

# Comma-separated list of eprint IDs to check (empty string when not given).
my $idlist = "";
if ( defined $idlist_opt )
{
	# Tolerate whitespace around the commas and at either end, then insist
	# that the whole value is digits separated by single commas.
	( my $candidate = $idlist_opt ) =~ s/\s*,\s*/,/g;
	$candidate =~ s/\A\s+|\s+\z//g;
	if ( $candidate !~ m/\A[0-9]+(?:,[0-9]+)*\z/ )
	{
		print STDERR "\nError: List of IDs is invalid.\n\n";
		exit 1;
	}
	$idlist = $candidate;
}

# Upper limit on the number of IDs checked: 100 for --random, otherwise
# 100000, unless overridden with --sample.
my $sample = 100000;
$sample = 100 if $random;
if ( defined $sample_opt )
{
	unless ( $random || $idlist )
	{
		print STDERR "\nError: --sample can only be used with --random or --idlist\n\n";
		exit 1;
	}
	$sample = $parse_positive_int->( $sample_opt, "" );
}

# Space-separated list of eprint statuses to restrict the check to, or undef
# when --status was not given.
my $status = undef;
if ( defined $status_opt )
{
	# Statuses are compared by exact string lookup.  The option value must
	# not be interpolated into a regular expression: a value such as ".*"
	# would then be accepted as a "valid" status.
	my %valid_status = map { $_ => 1 } ( 'inbox', 'buffer', 'archive', 'deletion' );
	my @statuses = split( ',', $status_opt );

	# An empty value (e.g. --status=) names no status at all, so reject it.
	my $status_error = @statuses ? 0 : 1;
	for my $a_status ( @statuses )
	{
		$status_error = 1 unless $valid_status{$a_status};
	}
	if ( $status_error )
	{
		print STDERR "\nError: $status_opt is not a valid status or list of statuses.\n\n";
		exit 1;
	}
	$status = join( " ", @statuses );
}

# Constants used when reading quest's output in the main loop:
#   $inc_rows        - step, in lines, between successive candidate ID lines
#   $id_pos          - how many lines before the eprint ID line the Xapian
#                      document ID is read from
#   $no_result_lines - output with this many lines or fewer counts as
#                      "no results"
my $inc_rows = 2;
my $id_pos = $inc_rows - 1;
my $no_result_lines = 3;

unless ( $quiet )
{
	print "\nRunning $0 at ".localtime()."\n\n";
}

my $repo = $session->get_repository;
my $db = $session->get_database();

# User recorded as the creator of indexing tasks; only needed with --reindex.
my $user;
if ( $reindex && defined $user_opt )
{
	# An explicit --user must name an existing account.
	$user = EPrints::DataObj::User::user_with_username( $session, $user_opt );
	if ( !defined $user )
	{	
		print STDERR "\nError: No user found for username '$user_opt'.\n\n";
		exit 1;
	}
}
elsif ( $reindex )
{
	# No --user given: take the first local_admin/admin user ordered by userid.
	my $admins = $repo->dataset( "user" )->search(
		filters => [ { meta_fields => [ 'usertype' ], value => 'local_admin admin', match => 'IN', describe=>0 } ],
		custom_order => "userid",
	);
	if ( $admins->count() == 0 )
	{	
		print STDERR "\nError: No admin user found to set as creator of indexing task.\n\n";
		exit 1;
	}
	$user = $admins->item(0);
}

my $ds = $repo->dataset( "eprint" );

# List of eprints to check.  It is filled in here only when the check is
# limited to recently modified items: either --random combined with --days, or
# none of --random, --all and --idlist.
my $list;
if ( ( $random && defined($days_opt) ) || ( !$random && !$all && !$idlist ) )
{
	$period = " in the last $days days";

	# Subtracting seconds from a Time::Piece object yields another
	# Time::Piece object, so format it directly.
	# NOTE(review): the previous code passed $since back through localtime();
	# Time::Piece appears to treat an object first argument as the invocant and
	# fall back to the current time, which would make the cutoff "today"
	# instead of $days days ago - confirm against the installed Time::Piece.
	my $since = localtime() - $days*24*60*60;
	my $since_timestamp = $since->strftime('%F-');

	# The value is "YYYY-MM-DD-"; presumably the trailing "-" makes it an
	# open-ended date range for the search - TODO confirm.
	my $searchexp = EPrints::Search->new( session=>$session, dataset=>$ds, custom_order=>"-lastmod" );
	$searchexp->add_field( fields => [ $ds->get_field( "lastmod" ) ], value => $since_timestamp, describe => 0 );
	$searchexp->add_field( fields => [ $ds->get_field( "eprint_status" ) ], value => $status, match => 'IN', describe => 0 ) if defined $status;
	$list = $searchexp->perform_search;	
}

# Collect candidate eprint IDs when an explicit --idlist was given or a random
# sample was requested; otherwise @ids stays empty and $list is left alone.
my @ids = ();
if ( $idlist )
{
	@ids = split( ",", $idlist );
}
elsif ( $random ) 
{
	if ( defined $list )
	{
		# Sample from the "recently modified" search results.
		@ids = @{ $list->ids() };
	}
	else
	{
		# Sample from every eprint, optionally restricted by status.
		my $sql = "SELECT eprintid FROM eprint";
		if ( $status )
		{
			# $status is a space-separated list, so it needs an IN list:
			# comparing against the joined string would match nothing once
			# more than one status was requested.  Only the four known
			# status words are ever placed in the SQL text.
			my @wanted = grep { m/\A(?:inbox|buffer|archive|deletion)\z/ } split( ' ', $status );
			$sql .= " WHERE eprint_status IN ('".join( "','", @wanted )."')";
		}
		$sql .= ";";
		my $statement = $db->prepare($sql);
		$db->execute($statement, $sql);
		while (my $row = $statement->fetchrow_hashref)
		{
			push @ids, $row->{eprintid};
		}
	}
}
if ( @ids )
{
	# Drop repeated IDs (keeping first occurrences) so that each item is
	# checked once and sampling always works on distinct values.
	my %seen;
	my @chosen = grep { !$seen{$_}++ } @ids;

	if ( scalar @chosen > $sample )
	{
		# Partial Fisher-Yates shuffle: move a uniformly chosen remaining
		# element into each of the first $sample slots, then discard the
		# rest.  This always terminates and needs no membership scans.
		for my $slot ( 0 .. $sample - 1 )
		{
			my $pick = $slot + int( rand( scalar( @chosen ) - $slot ) );
			@chosen[ $slot, $pick ] = @chosen[ $pick, $slot ];
		}
		$#chosen = $sample - 1;
	}
	$list = EPrints::List->new( repository => $repo, dataset => $ds, ids => \@chosen );
}

# Nothing has selected a list yet (for example --all): check every eprint,
# restricted to the requested statuses when --status was given.
if ( !defined $list )
{
	if ( !defined $status )
	{
		$list = $ds->search;
	}
	else
	{
		my $status_search = EPrints::Search->new(
			session => $session,
			dataset => $ds,
			custom_order => "-lastmod",
		);
		$status_search->add_field(
			fields => [ $ds->get_field( "eprint_status" ) ],
			value => $status,
			match => 'IN',
			describe => 0,
		);
		$list = $status_search->perform_search;
	}
}

# Totals reported in the summary once every item has been processed.
my $counter = 0;	# items processed
my $unfound = 0;	# items not found in the search results
my $status_wrong = 0;	# items found but with a wrong status, year or type
my $no_eprint = 0;	# list positions that held no eprint
my $no_title = 0;	# eprints without a title

my $listcount = $list->count;

print "Checking ".$listcount." items for Xapian search indexing.\n\n" if $verbose;

# For every item: search the Xapian database for its title with quest, look
# for the item's own ID among the results, then use delve to compare the
# indexed status, year/date and type with the eprint's values.
for ( my $i = 0; $i < $listcount; $i++ )
{
	# Prefix of every message about this item, e.g. "3/10:".
	my $progress = $i+1;
	$progress .= "/$listcount:";
	my $eprint = $list->item( $i );
	unless ( defined $eprint )
	{
		print "$progress No EPrint exists in position $i of results list.\n";
		$no_eprint++;
		$counter++;
		next;
	}
	my $found = 0;
	my $status_correct = 0;
	my $year_correct = 0;
	my $type_correct = 0;
	my $errfacets = "";
	my $title = $eprint->value('title');
	# A title that is a reference is read as a list of hashes and the 'text'
	# entry of the first element is used.
	unless (ref($title) eq "")
	{
		$title = $title->[0]{'text'};
	}
	unless ( defined $title )
	{
		print "$progress No title for EPrint ID ".$eprint->id.".\n";
		$no_title++;
		$counter++;
		next;
	}
	# Remove leading and trailing double-quotes marks so as not to confuse with part of query
	$title =~ s/^\"//g;
	$title =~ s/\"$//g;
	# Escape backslashes first.  The title is placed inside a double-quoted
	# shell word, where a backslash in front of a double quote (or at the end
	# of the title) would otherwise cancel the escaping below, end the quoted
	# word early and let the remainder of the title run as shell commands.
	$title =~ s/\\/\\\\/g;
	# Escape double quote marks and backticks.
	$title =~ s/\"/\\"/g;
	$title =~ s/\`/\\`/g;
	# Ignore $ symbols (replace with spaces).
	$title =~ s/\$/ /g;
	my @results = ();
	# First attempt: search for the title as a quoted phrase.
	my $command = $quest_location.' -f default -m '.$numresults.' -d '.$path.' "\"'.$title.'\""';
	print "$command\n" if $verbose;
	# Lines containing "12345678" are dropped from the output.
	# NOTE(review): the reason for this filter is not evident here - confirm.
	@results = `$command | grep -va 12345678`;
	if ( scalar @results <= $no_result_lines )
	{
		# Second attempt: search for the title without the phrase quotes.
		my $command = $quest_location.' -f default -m '.$numresults.' -d '.$path.' "'.$title.'"';
		print "$command\n" if $verbose;
		@results = `$command | grep -va 12345678`;
	}
	if ( scalar @results <= $no_result_lines )
	{
		print "$progress No results from EPrint ID ".$eprint->id.". ";
	}
	else
	{
		my $position = 0;
		my $line = 0;
		# Skip ahead to the "MSet:" line.  The bounds test comes first so that
		# output without such a line never reads past the end of @results.
		while ( $line < scalar @results && $results[$line] !~ m/MSet:/ )
		{
			$line++;
		}
		for ( my $l = $line; $l < scalar @results; $l = $l + $inc_rows )
		{
			next if $facets && $results[$l] !~ m/id/;
			# Strip characters outside \x0A-\x7F and a trailing "id"; only a
			# line that is then a bare positive integer counts as a result.
			$results[$l] =~ s/[^\x0A-\x7F]//g;
			$results[$l] =~ s/id$//g;
			next unless $results[$l] =~ m/^[1-9][0-9]*$/;
			$position++;
			if ( int($results[$l]) == $eprint->id )
			{
				$found = 1;
				# The Xapian document ID is the leading number on the line
				# $id_pos lines before the eprint ID line.
				# NOTE(review): if that line does not start with digits it is
				# passed to the shell unchanged - consider validating it.
				my $xapian_id = $results[$l-$id_pos];
				$xapian_id =~ s/^([0-9]+).*$/$1/;	
				$xapian_id =~ s/\n//;
				# Status: compare the indexed eprint_status value with the eprint's.
				my $delve_command = $delve_location.' -1 -r '.$xapian_id.' -d '.$path.' | grep -ae "^_\?eprint_status:" | sed "s/_\?eprint_status://g"';
				my @delve_results = `$delve_command`;	
				if ( defined($delve_results[0]) )
				{
					$delve_results[0] =~ s/\n//;
					$status_correct = 1 if $delve_results[0] eq $eprint->get_value( "eprint_status" );
				}
				# Year: an eprint without a date has nothing to compare.
				my $year = defined $eprint->get_value('date') ? substr( $eprint->get_value('date'), 0, 4 ) : "";
				$year_correct = 1 if $year eq "";
				if ( $year ne "" )
				{
					# With --facets the "year:" value is compared with the first
					# four characters of the date; otherwise the "date:" value
					# is compared with the whole date.
					$delve_command = $delve_location.' -1 -r '.$xapian_id.' -d '.$path.' | grep -ae "^_\?year:" | sed "s/_\?year://g"';
					$delve_command = $delve_location.' -1 -r '.$xapian_id.' -d '.$path.' | grep -ae "^_\?date:" | sed "s/_\?date://g"' if $facets == 0;
					@delve_results = `$delve_command`;
					if ( defined($delve_results[0]) )
					{
						$delve_results[0] =~ s/\n//;
						my $date = $eprint->get_value('date');
						$date = substr( $date, 0, 4 ) if $facets;
						$year_correct = 1 if $delve_results[0] eq $date;
					}
					else
					{
						# No indexed year/date value at all is not treated as an error.
						$year_correct = 1;
					}
				}
				# Type: compared case-insensitively; an eprint without a type
				# always passes.
				$delve_command = $delve_location.' -1 -r '.$xapian_id.' -d '.$path.' | grep -ae "^_\?type:" | sed "s/_\?type://g"';
				@delve_results = `$delve_command`;
				$type_correct = 1 if !defined $eprint->get_value( 'type' );
				if ( defined($delve_results[0]) )
				{
					$delve_results[0] =~ s/\n//;
					$type_correct = 1 if !defined $eprint->get_value( 'type' ) || lc $delve_results[0] eq lc $eprint->get_value( 'type' );
				}
				last;
			}
		}
		print "$progress No matching results from EPrint ID ".$eprint->id."." unless $found;
		# $errfacets names every value that did not match.
		$errfacets .= "status " unless $status_correct;
		$errfacets .= "year " unless $year_correct;
		$errfacets .= "type " unless $type_correct;
		print "$progress Found EPrint ID ".$eprint->id." in position $position but ".$errfacets."is/are not correct." if $found && "$errfacets" ne "";
		print "$progress Found EPrint ID ".$eprint->id." in position $position.\n" if $found && "$errfacets" eq "" && !$quiet;
	}
	if ( ( !$found || $errfacets ne "" ) && $reindex )
	{
		# Queue indexing events for the eprint and its documents: index_all
		# for the eprint, a "removed" event per document and, when there are
		# documents, an "index" event for the eprint's 'documents'.
		print " Will reindex.\n";
		EPrints::DataObj::EventQueue->create_unique( $session, {
			pluginid => "Event::Indexer",
			action => "index_all",
			params => [$eprint->internal_uri],
			userid => $user->id,
		});
		my @documents = $eprint->get_all_documents;
		foreach my $document ( @documents )
		{
			EPrints::DataObj::EventQueue->create_unique( $session, {
				pluginid => "Event::Indexer",
				action => "removed",
				params => ['document', $document->id ],
				userid => $user->id,
			});
		}
		if ( scalar @documents > 0 )
		{
			EPrints::DataObj::EventQueue->create_unique( $session, {
				pluginid => "Event::Indexer",
				action => "index",
				params => [$eprint->internal_uri, 'documents'],
				userid => $user->id,
			});
		}
	}
	# The failure messages above do not end in a newline; finish the line here
	# unless " Will reindex.\n" has already done so.
	print "\n" if !$reindex && ( !$found || "$errfacets" ne "" );
	$unfound++ unless $found;
	$status_wrong++ if $errfacets ne "" && $found;
	$counter++;
}
# Summary of the run; suppressed entirely by --quiet.  The first two lines are
# only shown when their counts are non-zero.
unless ( $quiet )
{
	if ( $no_eprint )
	{
		print "\n$no_eprint/$counter IDs had no equivalent EPrints so could not be searched in Xapian index.";
	}
	if ( $no_title )
	{
		print "\n$no_title/$counter EPrints had no title so could not be searched in Xapian index.";
	}
	print "\n$unfound/$counter EPrints were found not to be indexed by Xapian$period.\n";
	print "$status_wrong/$counter EPrints were found to be indexed by Xapian$period but whose status, year or type were not set correctly.\n";
}

$session->terminate();
exit;


=head1 COPYRIGHT

=for COPYRIGHT BEGIN

Copyright 2023 University of Southampton.
EPrints 3.4 is supplied by EPrints Services.

http://www.eprints.org/eprints-3.4/

=for COPYRIGHT END

=for LICENSE BEGIN

This file is part of EPrints 3.4 L<http://www.eprints.org/>.

EPrints 3.4 and this file are released under the terms of the
GNU Lesser General Public License version 3 as published by
the Free Software Foundation unless otherwise stated.

EPrints 3.4 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints 3.4.
If not, see L<http://www.gnu.org/licenses/>.

=for LICENSE END
