#!perl
use strict;
use Data::Dumper;
use Carp;

#
# This is a SAS Component
#

=head1 fids_to_protein_sequences


fids_to_protein_sequences allows the user to look up the amino acid sequences
corresponding to each of a set of fids.  You can also get the sequence from proteins (i.e., md5 values).
This routine saves you having to look up the md5 sequence and then accessing
the protein string in a separate call.


Example:

    fids_to_protein_sequences [arguments] < input > output

The standard input should be a tab-separated table (i.e., each line
is a tab-separated set of fields).  Normally, the last field in each
line would contain the identifier. If another column contains the identifier,
use

    -c N

where N is the column (from 1) that contains the identifier.

This is a pipe command. The input is taken from the standard input, and the
output is to the standard output.

=head2 Documentation for underlying call

This script is a wrapper for the CDMI-API call fids_to_protein_sequences. It is documented as follows:

  $return = $obj->fids_to_protein_sequences($fids)

=over 4

=item Parameter and return types

=begin html

<pre>
$fids is a fids
$return is a reference to a hash where the key is a fid and the value is a protein_sequence
fids is a reference to a list where each element is a fid
fid is a string
protein_sequence is a string

</pre>

=end html

=begin text

$fids is a fids
$return is a reference to a hash where the key is a fid and the value is a protein_sequence
fids is a reference to a list where each element is a fid
fid is a string
protein_sequence is a string


=end text

=back

=head2 Command-Line Options

=over 4

=item -c Column

This is used only if the column containing the identifier is not the last column.

=item -i InputFile    [ use InputFile, rather than stdin ]

=item -fasta

This is used to request a fasta output file (dropping all of the other columns in the input lines).
Fasta output is the default; use -fasta=0 to get the tab-delimited format instead.

=back

=head2 Output Format

The standard output is just a fasta file with the sequence.  You can also get
a tab-delimited file by using -fasta=0.  The tab-delimited format consists of the input
file with an extra column of sequence added.

Input lines that cannot be extended are written to stderr.

=cut


# Usage text shown when the client object cannot be created.  It lists every
# option this script registers below (-c, -i and -fasta).
my $usage = "usage: fids_to_protein_sequences [-c column] [-i input_file] [-fasta 0|1] < input > output";

use Bio::KBase::CDMI::CDMIClient;
use Bio::KBase::Utilities::ScriptThing;

# -c: 1-based column holding the identifier; left undefined when not given.
my $column;

# -i: read from this file instead of standard input.
my $input_file;

# -fasta: non-zero (the default) writes fasta records, 0 writes the input
# line with the sequence appended as an extra tab-separated column.
my $fasta = 1;

# new_for_script receives the option specs for this script and hands back the
# client object used for the fids_to_protein_sequences call.
# NOTE(review): presumably a false return means option parsing failed --
# confirm against Bio::KBase::CDMI::CDMIClient.
my $kbO = Bio::KBase::CDMI::CDMIClient->new_for_script('c=i' => \$column,
				     'fasta=i' => \$fasta,
				      'i=s' => \$input_file);
if (! $kbO)
{
    # Terminate the message with a newline and report failure to the caller
    # through a non-zero exit status.
    print STDERR "$usage\n";
    exit 1;
}

# Choose the input handle: standard input unless -i named a file to read.
my $ih = \*STDIN;
if ($input_file)
{
    # Open into a fresh lexical so that STDIN itself is never reopened.
    open my $file_handle, "<", $input_file or die "Cannot open input file $input_file: $!";
    $ih = $file_handle;
}

# Main loop.  Each pass takes one batch of tuples from GetBatch, looks up the
# sequences for all identifiers in that batch with one call, and then writes
# one result per tuple.  Each tuple is an array reference whose first element
# is the identifier and whose second element is the original input line.
my %fasta_written;   # identifiers already written as fasta, to remove any possible duplicates
while (my @tuples = Bio::KBase::Utilities::ScriptThing::GetBatch($ih, undef, $column)) {
    my @ids = map { $_->[0] } @tuples;
    my $sequence_for = $kbO->fids_to_protein_sequences(\@ids);

  TUPLE:
    for my $tuple (@tuples) {
        my ($id, $line) = @$tuple;
        my $found = $sequence_for->{$id};

        # No sequence came back for this identifier: echo the input line to
        # STDERR and move on.
        if (! defined($found)) {
            print STDERR $line, "\n";
            next TUPLE;
        }

        # The value may be a single sequence or a reference to a list of
        # sequences; treat both uniformly as a list.
        my @sequences = ref($found) eq 'ARRAY' ? @$found : ($found);

        for my $sequence (@sequences) {
            if (! $fasta) {
                # Tab-delimited mode: input line plus one extra column.
                print "$line\t$sequence\n";
            }
            elsif (! $fasta_written{$id}) {
                # Fasta mode: at most one record per identifier.
                $fasta_written{$id} = 1;
                print ">$id\n$sequence\n";
            }
        }
    }
}
