#!perl
# gmitool - a utility for gemini, part of Net::Gemini
use strict;
use warnings;
use Cpanel::JSON::XS      qw(decode_json encode_json);
use Encode                qw(decode);
use File::Path            qw(make_path);
use File::Slurper         qw(read_text write_text);
use File::Spec::Functions qw(catdir catfile splitpath);
use Getopt::Long          qw(GetOptionsFromArray);
use IO::Socket            qw[PF_INET PF_INET6];
use Net::Gemini 0.10;
use Parse::MIME 'parse_mime_type';
use URI ();

Getopt::Long::Configure('bundling');

my $Address_Family;    # v4 or v6 or (defaults to) any
my $Allow_Verified;    # allow verified certificates
my $Be_Quiet;
my $Force_Update;      # clobber known_hosts?
my $Hosts_Dirty;       # do we need to update known_hosts?
my $Known_Hosts;       # hashref, from/to JSON
my $Show_Status;       # log various things to stderr

die "gmitool: command [args ..]\n" unless @ARGV;

my %commands = ( get => \&get, link => \&link );

my $cmd = shift;
if ( $cmd =~ m{^gemini://} ) {
    unshift @ARGV, $cmd;
    $cmd = 'get';
} elsif ( !exists $commands{$cmd} ) {
    die "gmitool: no such command '$cmd' (",
      join( ' ', sort keys %commands ),
      ")\n";
}
$commands{$cmd}->( \@ARGV );

# gets a gemini page
sub get {
    my ($args) = @_;
    GetOptionsFromArray(
        $args,
        4         => sub { $Address_Family = PF_INET },
        6         => sub { $Address_Family = PF_INET6 },
        A         => \$Allow_Verified,
        'C=s'     => \my $client_cert,
        'E=s'     => \my $output_encoding,
        'H=s'     => \my $sni_host,
        'K=s'     => \my $client_key,
        S         => \$Show_Status,
        'V=s'     => \my $verify,
        f         => \$Force_Update,
        'links|l' => \my $Only_Links,
        q         => \$Be_Quiet,
        't=i',    => \my $timeout
    ) or exit 64;

    my $resource = $args->[0];
    die "Usage: gmitool get [options] url\n"
      unless defined $resource and length $resource;

    # meta is UTF-8 and may appear in STDERR
    # KLUGE this assumes that UTF-8 is correct; maybe this could also
    # use $output_encoding if that is set? (or remove output encoding
    # and insist on things being UTF-8)
    binmode *STDERR, ':encoding(UTF-8)';

    # default if -E not specified and content has a charset
    $output_encoding = ':encoding(UTF-8)' unless defined $output_encoding;

    my ( $known_hosts_file, %param, @unveils );

    if ( defined $client_cert ) {
        $param{ssl}->{SSL_cert_file} = $client_cert;
        push @unveils, [ $client_cert, 'r' ];
    }
    if ( defined $client_key ) {
        $param{ssl}->{SSL_key_file} = $client_key;
        push @unveils, [ $client_key, 'r' ];
    }
    if ( defined $sni_host ) {
        $param{ssl}->{SSL_hostname} = $sni_host;
    }
    $param{family} = $Address_Family if defined $Address_Family;
    $param{ssl}->{Timeout} = $timeout || 30;

    if ( defined $verify ) {
        if ( $verify eq 'peer' ) {
            $param{ssl}->{SSL_verify_mode}     = 1;       # SSL_VERIFY_PEER
            $param{ssl}->{SSL_verify_callback} = undef;
        } elsif ( $verify eq 'none' ) {
            warn "NOTICE no certificate verification\n" if $Show_Status;
            $param{ssl}->{SSL_verify_mode}     = 0;       # SSL_VERIFY_NONE
            $param{ssl}->{SSL_verify_callback} = undef;
        } else {
            die "gmitool: unknown verify mode '$verify'\n";
        }
    } else {
        my $cache_dir;
        if ( length $ENV{GMITOOL_HOSTS} ) {
            $cache_dir        = ( splitpath( $ENV{GMITOOL_HOSTS} ) )[1];
            $known_hosts_file = $ENV{GMITOOL_HOSTS};
        } else {
            die "gmitool: HOME is not set" unless defined $ENV{HOME};
            $cache_dir        = catdir( $ENV{HOME}, qw{.cache gmitool} );
            $known_hosts_file = catfile( $cache_dir, 'known_hosts' );
        }
        make_path($cache_dir);
        my $buf;
        eval { $buf = read_text($known_hosts_file) };
        $Known_Hosts = decode_json($buf) if defined $buf and length $buf;
        push @unveils, [ $cache_dir, 'cw' ];
        @param{qw(tofu verify_ssl)} = ( 1, \&verify_ssl );
    }

    my $pledge = eval {
        require OpenBSD::Pledge;
        require OpenBSD::Unveil;
        OpenBSD::Unveil->import;
        1;
    };
    if ($pledge) {
        OpenBSD::Pledge::pledge(qw{cpath dns inet rpath unveil wpath});
        unveil( $ENV{SSL_CERT_DIR},  'r' ) if exists $ENV{SSL_CERT_DIR};
        unveil( $ENV{SSL_CERT_FILE}, 'r' ) if exists $ENV{SSL_CERT_FILE};
        unveil(qw{/etc/ssl r});
        for my $dir      (@INC)     { unveil( $dir, 'r' ) }
        for my $pathperm (@unveils) { unveil(@$pathperm) }
        unveil();
    }

    my ( $gem, $code );
    my $redirects = -1;

  REQUEST:
    ( $gem, $code ) = Net::Gemini->get( $resource, %param );
    if ( $pledge and $code != 3 ) {
        OpenBSD::Pledge::pledge(qw{cpath rpath wpath});
    }
    if ( $code == 2 ) {
        warn "META " . $gem->meta . "\n" if $Show_Status;
        my ( $type, $sub, $pr ) = parse_mime_type( $gem->meta );
        my ( $encoded, $charset ) = is_encoded($pr);
        $encoded = 0 if defined $output_encoding and $output_encoding eq '';
        if ($Only_Links) {
            if ( $type eq 'text' and $sub eq 'gemini' ) {
                show_links($gem);
                goto CLEANUP;
            }
        }
        if ($encoded) {
            my $body = '';
            $gem->getmore( sub { $body .= $_[0]; 1 } );
            binmode STDOUT, $output_encoding if defined $output_encoding;
            print decode( $charset, $body, Encode::FB_CROAK );
        } else {
            binmode STDOUT, ':raw';    # garbage in, garbage out
            $gem->getmore( sub { syswrite STDOUT, $_[0]; 1 } );
        }
    } elsif ( $code == 0 ) {
        my $e = $gem->error;
        chomp $e;
        die "gmitool: error: $e\n";
    } elsif ( $code == 3 ) {
        die "gmitool: too many redirects ($redirects) " . $gem->meta . "\n"
          if ++$redirects >= 5;    # amfora also uses 5 max redirects
        my $new = $gem->meta;
        $resource = URI->new_abs( $new, $gem->{_uri} );
        warn "REDIRECT " . $resource . "\n" if $Show_Status;
        sleep 1;                   # don't be too quick about a loop
        goto REQUEST;
    } elsif ( $code == 4 ) {
        die 'gmitool: temporary-failure '
          . $gem->status . ' '
          . $gem->meta . "\n";
    } elsif ( $code == 5 ) {
        die 'gmitool: permanent-failure '
          . $gem->status . ' '
          . $gem->meta . "\n";
    } elsif ( $code == 6 ) {
        die 'gmitool: client-certificate '
          . $gem->status . ' '
          . $gem->meta . "\n";
    }

  CLEANUP:
    if ($Hosts_Dirty) {
        write_text( $known_hosts_file, encode_json($Known_Hosts) );
    }
}

# is it encoded in something besides UTF-8, or also besides US-ASCII of
# which UTF-8 is a superset?
sub is_encoded {
    my ($pr) = @_;
    if ( exists $pr->{charset} ) {
        # TODO can US-ASCII appear in any other forms (and how likely
        # are we to see them, and would it even cause a problem?
        return 1, $pr->{charset}
          unless $pr->{charset} =~ m/(?i)utf-8/
          or $pr->{charset} =~ m/(?i)ascii/;
    } else {
        # a server might return 'CHARSET' or maybe 'Charset' or whatever
        # according to the gemini torture tests (see t/torture.t), so
        # look for look for those alternative forms
        for my $key (%$pr) {
            if ( $key =~ m/^(?i)charset$/ ) {
                return 1, $pr->{$key}
                  unless $pr->{$key} =~ m/(?i)utf-8/
                  or $pr->{$key} =~ m/(?i)ascii/;
            }
        }
    }
    return 0, 'UTF-8';
}

# extracts links in text/gemini input
sub link {
    my ($args) = @_;
    GetOptionsFromArray(
        $args,
        'base|b=s'   => \my $base,
        'relative|r' => \my $relative,
    ) or exit 64;
    while ( my $line = readline ) {
        if ( $line =~ m/^=>\s*(\S+)/ ) {
            my $u = defined $base ? URI->new_abs( $1, $base ) : URI->new($1);
            next if $relative and defined $u->scheme;
            print $u->canonical, "\n";
        }
    }
}

# parse and qualify links out of what is assumed to be text/gemini
# content. links can be followed by an optional description:
#   =>/about/
#   => photos/ all the cats
sub show_links {
    my ($gem) = @_;
    my $base = $gem->{_uri};
    # KLUGE in theory may need support encoding, but if we assume that
    # the links are ASCII with anything fancy encoded, and that the
    # environment assumes UTF-8 or ASCII...
    binmode *STDOUT, ':raw';
    my $buf = '';
    my $eom;
    $gem->getmore(
        sub {
            $buf .= $_[0];
            $eom = 0;
            while ( $buf =~ m{^=>[ \t]*(\S+)(?:[ \t]+[^\r\n]*)?[\r\n]}gm ) {
                print URI->new_abs( $1, $base )->canonical, "\n";
                $eom = $+[0];
            }
            substr $buf, 0, $eom, '' if $eom;
            1;
        }
    );
}

# NOTE the host and possibly certificate maybe should be hashed for
# privacy, though the benefits of this seem dubious given how few gemini
# servers there are and other means of collecting connection information
sub verify_ssl {
    my ($param) = @_;
    return 1 if $Allow_Verified and $param->{okay};
    my $key = join '|', @{$param}{qw(host port)};
    my $new = { map { $_ => $param->{$_} }
          qw(digest ip notAfter notBefore okay) };
    # one could save the whole certificate with something like
    #$new->{cert} = Net::SSLeay::PEM_get_string_X509($param->{cert});
    my @fields = qw{notAfter notBefore ip okay};
    if ( !exists $Known_Hosts->{$key} ) {
        ( $Known_Hosts->{$key}, $Hosts_Dirty ) = ( $new, 1 );
    } elsif ($Force_Update) {
        unless ( $Be_Quiet
            or $Known_Hosts->{$key}{digest} eq $param->{digest} ) {
            verify_warn( $key, $Known_Hosts->{$key}, $new, \@fields );
        }
        # merge new over old in the event there are unknown keys that
        # something else added to the cache
        my %update = ( %{ $Known_Hosts->{$key} }, %$new );
        ( $Known_Hosts->{$key}, $Hosts_Dirty ) = ( \%update, 1 );
    } else {
        if ( $Known_Hosts->{$key}{digest} ne $param->{digest} ) {
            verify_warn( $key, $Known_Hosts->{$key}, $new, \@fields );
            exit 1;
        }
    }
    return 1;
}

sub verify_warn {
    my ( $key, $old, $new, $fields ) = @_;
    warn qq(gmitool: digest mismatch "$key"\n);
    for my $dtl (@$fields) {
        if ( exists $old->{$dtl} and $old->{$dtl} ne $new->{$dtl} ) {
            warn "  $dtl\t$old->{$dtl}\t$new->{$dtl}\n";
        }
    }
}

__END__

=head1 NAME

gmitool - a Net::Gemini command line tool

=head1 SYNOPSIS

  gmitool get [-46] [-A] [-E encoding-layer] [-C cert-file -K key-file] \
              [-H sni-host] [-S] [-V peer|none] \
              [-f] [-l] [-q] [-t seconds] \
              gemini://example.org
  gmitool link [-b base-url] < text-gemini-content

=head1 DESCRIPTION

B<gmitool> offers various gemini protocol related utilities. It is part
of the L<Net::Gemini> module. Subcommands include:

=head2 get

Gets a gemini page and prints it to standard out, if all goes well
(garbage in, garbage out). Options:

=over 4

=item C<-4>

Use only IPv4. The default is to use any protocol.

=item C<-6>

Use only IPv6.

=item C<-A>

Accept verified leaf certificates without going through the usual TOFU
path, assuming that the certificate can be verified. Probably good with
sites that use "Let's Encrypt" as these certificates change frequently
and would otherwise need the use of the C<-f> flag to force updates, and
usually will (but may not) verify correctly.

=item C<-C> I<certificate-file>

Client certificate file, use with C<-K> for when B<gmitool> must use a
custom certificate.

=item C<-E> I<encoding-layer>

Specifies the output encoding should the remote content have C<charset>
associated with it; the default in such a case is to convert to UTF-8.
If set to the empty string C<-E ''> then C<:raw> will be used--garbage
in, garbage out. Otherwise, the I<encoding-layer> should be something
Perl supports such as C<:encoding(UTF-8)>.

=item C<-K> I<key-file>

Client private key file, use with C<-C>.

=item C<-H> I<hostname>

Use the given I<hostname> as the SNI host instead of the default that is
taken from the URL given.

=item C<-S>

Show various diagnostic information (the META field, redirects, etc).

=item C<-V> I<mode>

Specifies a custom certificate verification mode. By default Trust On
First Use (TOFU) is used, which only checks the first leaf certificate
against the C<known_hosts> table.

Verification modes include C<peer> to verify the peer certificates (the
full chain), and C<none> to do no verification. See also C<-A>.

The C<SSL_CERT_FILE> and C<SSL_CERT_DIR> environment variables can be
used to customize the trusted certificate authority certificates.

=item C<-f>

Force update of TOFU certificates. Use C<-q> to hide the warnings.
Updates to the cache will not happen if C<-A> is used and the
certificate can be verified.

=item C<-l>

Only show the links of the content if the response is text/gemini.

=item C<-q>

Do not show a warning when force updating a changed certificate.

=item C<-t> I<seconds>

Custom connect timeout, 30 seconds by default.

=back

=head2 link

Extracts link from text/gemini input, and qualifies any relative links
if the C<-b> I<base-url> option is given. Only shows relative links if
the C<-r> option is given.

=head1 ENVIRONMENT

=over 4

=item C<GMITOOL_HOSTS>

Custom known hosts file.

=item C<SSL_CERT_DIR>

Custom directory for SSL certificate authority certificates. The default
is the operating system (OS) default, which could be C</etc/ssl> or
similar. Customize this and the next to specify that only certain
certificate authorities should be trusted, as opposed to everything that
ships with the OS by default.

  env SSL_CERT_DIR=/some/where SSL_CERT_FILE=/dev/null gmitool ...

=item C<SSL_CERT_FILE>

Custom file for SSL certificate authorities.

=back

=head1 FILES

C<~/.cache/gmitool/known_hosts> is where the TOFU records are stored by
default. JSON format, UTF-8 encoding.

=head1 EXIT STATUS

The B<gmitool> utility exits 0 on success, and >0 if an error occurs.

=head1 SEE ALSO

L<Net::Gemini>, L<ftp(1)>, L<openssl(1)>, L<nc(1)>

=head1 AUTHOR

Jeremy Mates

=cut
