#!/usr/bin/perl -w
use strict;

# This is set to where Swish-e's "make install" installed the helper modules.
use lib ( '/usr/local/lib/swish-e/perl' );

###################################################################################
#
#    Copyright (C) 2001 Bill Moseley swishscript@hank.org
#    Program to test the SWISH::Filter module
#
#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License
#    as published by the Free Software Foundation; either version
#    2 of the License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    The above lines must remain at the top of this program
#
#    $Id: swish-filter-test.in,v 1.7 2005/06/09 18:58:24 augur Exp $
#
####################################################################################

use Getopt::Long;
use SWISH::Filter;
use Pod::Usage;
use URI;

use constant ABORT => 0;
use constant DEBUG => 1;
use constant INFO  => 2;

my ( $verbose, $show_content, @file, @url, $help, $man, $quiet, $headers, $path, $depreciated, $mimetypes);

my $skip_binary = 1;

my $lines = 10;
my $max_chars = 1000;


GetOptions(
        'verbose!'   => \$verbose,  # turn on INFO messages
        'content!'   => \$show_content,
        'quiet'      => \$quiet,
        'lines=i'    => \$lines,
        'help|?'     => \$help,
        'man'        => \$man,
        'headers'    => \$headers,
        'skip_binary!' => \$skip_binary,
        'path'       => \$path,
        'depreciated'=> \$depreciated,
        'mimetypes'  => \$mimetypes,

) || pod2usage(2);
pod2usage( -verbose => 1 ) if $help;
pod2usage( -verbose => 2 ) if $man;

if ( $path ) {
    print '/usr/local/lib/swish-e/perl',"\n";
    exit;
}



pod2usage(
    -verbose => 0,
    -message => "Must specify a file or URL",
    -exitvar => 1,
) unless @ARGV or $mimetypes;



$ENV{FILTER_DEBUG} = 1 if $verbose;  # used by SWISH::Filter

msg(INFO, "SWISH::Filter found at [%s]\n", $INC{'SWISH/Filter.pm'} );

my $filter = SWISH::Filter->new;

if ( $mimetypes ) {
    my @filters = $filter->filter_list;
    print "Mimetypes:\n\n";
    for my $filter ( @filters ) {
        print "  $filter:\n";
        for my $pattern ( $filter->mimetypes ) {
            print"     $pattern\n";
        }
        print "\n";
    }
}


my $return = 0;

for my $doc ( @ARGV ) {
    eval { $depreciated ? process_doc_old( $doc ) : process_doc( $doc ) };
    $return = 1 if $@;
    warn "** $0:\n  $@\n" if $@;  # always warn on die
}

exit $return;

sub process_doc {
    my ($file) = @_;


    my $uri;
    eval { $uri = URI->new( $file ) };
    my %config = !$@ && $uri->scheme  ? fetch_url( $file ) : fetch_file( $file);


    my $doc = $filter->convert(
        %config,
        name => $file,
    );

    die "Failed to process document [$file]\n" unless $doc;

    my $content_type = $doc->content_type || "unknown";
    my $parser_type = $doc->swish_parser_type || '';

    my $binary = $doc->is_binary;

    my $msg = $doc->was_filtered ? '' : 'not';

my $name = $doc->name;


    msg(DEBUG, <<EOF );

Document $file was $msg filtered.
   Document:     $file  ($name)
   Content-Type: $content_type
   Parser type:  $parser_type
EOF

    if ( my $filters_used = $doc->filters_used ) {
        for my $filter ( @$filters_used ) {
            msg( DEBUG, "   >Filter used: $filter->{name} ( $filter->{start_content_type} -> $filter->{end_content_type} )" );
         }
    }


    if ( !$binary ) {
        my @doc = split /\n/, substr( ${$doc->fetch_doc}, 0, $max_chars );
        $lines = @doc-1 if $lines >= @doc;
        msg(INFO, join "\n", '-- Output Content Sample --', @doc[0..$lines],'','-- end --','' );
    }


    die "Skipping binary [$file]\n" if $binary && $skip_binary;

    if ($headers ) {
        my $len = length ${$doc->fetch_doc};

        print "Path-Name: $file\nContent-Length: $len\n";
        print "Document-Type: $parser_type\n" if $parser_type;
        print "\n";
    }


    print ${$doc->fetch_doc} if $show_content;
}


sub process_doc_old {
    my ($file) = @_;


    my $uri;
    eval { $uri = URI->new( $file ) };
    my %config = !$@ && $uri->scheme  ? fetch_url( $file ) : fetch_file( $file);


    my $was_filtered = $filter->filter(
        %config,
        name => $file,
    );

    my $content_type = $filter->content_type || "unknown";
    my $orig_content_type = $filter->original_content_type || "unknown";
    my $parser_type = $filter->swish_parser_type || '';

    my $binary = $content_type !~ m[^text/];

    my $msg = $was_filtered ? '' : 'not';

    msg(DEBUG, <<EOF );

Document $file was $msg filtered.
   Document:     $file
   Content-Type: $content_type  (initial was $orig_content_type)
   Parser type:  $parser_type
EOF

    if ( !$binary ) {
        my @doc = split /\n/, substr( ${$filter->fetch_doc}, 0, $max_chars );
        $lines = @doc-1 if $lines >= @doc;
        msg(INFO, join "\n", '-- Output Content Sample --', @doc[0..$lines],'','-- end --','' );
    }


    die "Skipping binary [$file]\n" if $binary && $skip_binary;

    if ($headers ) {
        my $len = length ${$filter->fetch_doc};

        print "Path-Name: $file\nContent-Length: $len\n";
        print "Document-Type: $parser_type\n" if $parser_type;
        print "\n";
    }


    print ${$filter->fetch_doc} if $show_content;
}

sub fetch_file {
   my $file = shift;


   # just try to open for error reporting
   open FH, "<$file" or die "Failed to open '$file': $!\n";

   close FH;

   die "File '$file' has zero size\n" if -z $file;
   return ( document => $file );

}


sub fetch_url {
    my $url = shift;

    eval { require LWP::UserAgent };
    die "LWP::UserAgent is required to fetch a URL\n$@\n" if $@;

    my $ua = LWP::UserAgent->new;
    my $request = HTTP::Request->new('GET', $url );
    my $response = $ua->request( $request );
    die sprintf "Error while getting '%s'.  Server returned %s.", $response->request->uri,$response->status_line
        unless $response->is_success;

    my $content = $response->content;
    my $content_type = $response->content_type;

    return (
        document => \$content,
        content_type => $content_type,
    );
}



sub msg {
    my $msg_level = shift;

    return if $quiet;
    return if !$verbose && $msg_level > DEBUG;
    printf( STDERR @_), print STDERR "\n";
}



__END__

=head1 NAME

swish-filter-test - program to test the SWISH::Filter module.

=head1 SYNOPSIS

swish-filter-test [options] <file or url>  <...>

 Options:
   -quiet           don't generate messages to stderr
   -content         output content to stdout
   -(no)skip_binary skip output of binary files (default)
   -lines <num>     Number of lines of content to display to stderr if verbose
   -headers         output with headers for swish-e -S prog method
   -verbose         enable $ENV{FILTER_DEBUG} for verbose output
   -path            output @INC path to SWISH::Filter module
   -help            brief help message
   -man             full documentation

=head1 DESCRIPTION

swish-filter-test is a program to test the Perl module SWISH::Filter.
SWISH::Filter is a module that is included with the swish-e distribution.

Documents supplied to this script (as a URL or a plain file) on the command line
are passed to the SWISH::Filter module.  This is useful for testing filters.

The SWISH::Filter module works by checking a document's content-type and looking
for an available filter.  Most filters require additional helper programs (e.g.
the filter to convert PDF to HTML requires the Xpdf package).  Using the -verbose
options should indicate if a required program is missing.

Options to this script control how much output is generated.  Options can also be specified
to generate output that can be piped directly to swish-e (see C<-headers> below).

All messages are sent to stderr unless --content or -headers are specified and then content
is sent to stdout.


=head1 OPTIONS

=over 8

=item B<-quiet>

Don't write info out to stderr.  Normally not useful unless you just want to filter
a document and not really test the SWISH::Filter module.

Fatal errors are written to
stderr error regardless of the -quiet option.

=item B<-verbose>

Enables FILTER_DEBUG mode in the SWISH::Filter module, and enables extra info
including a summary of the filtered document to stderr.  Enable if trying to find
out why a filter does not work.


=item B<-lines>

Number of summary lines of summary content to show.  Summary lines are only showed if -verbose
is selected.  Lines are sent to stderr, not stdout.

Note, the summary is limited to 1000 characters regardless of this option.

=item B<-content>

Specifying -content will output full content to stdout.  The default is to only display
a few lines.

=item B<-(no)skip_binary>

The default is to not output content from binary files.  -noskip_binary will disable this.

=item B<-headers>

Prints the headers used by swish-e when reading documents with -S prog.  This can be used to
filter documents directly to swish-e:

 swish-filter-test -headers -content http://localhost/ test.pdf | swish-e -S prog -i stdin -v1

=item B<-path>

Prints the installed location of the SWISH::Filter parent directory for use in PERL5LIB,
Allows using SWISH::Filter in other programs, or with the Swish-e -S http method with
swishspider.

For example:

   PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost


=item B<-help>

Print a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=back


=cut

