#!/usr/bin/perl

###############################################################################
#   pgimport:  ASCII import utility for PostgreSQL                            #
#   version 0.9.0 2004-01-26                                                  #
#                                                                             #
#   Copyright 2004, Wayne Matthew Syvinski                                    #
#                                                                             #
#   If you use this software, you agree to the following:                     #
#                                                                             #
#   (1) You agree to hold harmless and waive any claims against the author.   #
#   (2) You agree that there is no warranty, express or implied, for this     #
#       software whatsoever.                                                  # 
#   (3) You will abide by the GNU General Public License or the Artistic      #
#       License in the use of this software.                                  #
#   (4) You agree not to modify this notice.                                  #
#                                                                             #
#   If you use this software and find it useful, I would appreciate an        #
#   emailed note to matthew@techcelsior.com                                   #
#                                                                             #
###############################################################################

use strict;
use warnings;

use DBI;
use Text::ParseWords;
BEGIN
{
	eval "use Term::ReadKey";
};
#Term::ReadKey is not part of the standard Perl distribution as of 5.8.1
#Perl installations that do not have it can still run pgimport
#but passwords will appear in cleartext on the terminal

#file-scoped working variables shared by every later stage of the script

my @config;          #positional settings; see section INTERNALS of the POD
my $configitem;

my $fieldnamelist;
my @fieldnames;
my $fieldname;

my $createtable;
my $insertquery;
my $insertstub;

my $tempitem;
my $tempitem2;
my @temparray;
my $i;

my @timearray;

my $dbh;
my $sth;

my $dataline;
my $datum;
my @dataarray;
my $insertlist;

my $unpackspec;
my @speclines;
my @widths;

my $termkey;

my $incomingformat;
my $nativeformat;

my $recordcount = 0;

#$termkey is 1 when $@ is empty and 0 otherwise; this assumes $@ still holds
#the outcome of the string eval in the BEGIN block that tries Term::ReadKey
$termkey = $@ ? 0 : 1;

print STDOUT <<'BANNER';
+==========================================+
|pgimport:  PostgreSQL ASCII import utility|
|   Copyright 2003 Wayne Matthew Syvinski  |
|                                          |
|Use allowed under the Artistic License or |
|the GNU General Public License, version 2 |
|                or later.                 |
|                                          |
|Complete terms and documentation contained|
|      within source code (POD format)     |
+==========================================+

BANNER

#fill @config either from a configuration file (first argument is the
#single letter c, second argument is the file name, one setting per line)
#or directly from the command-line arguments

unless (@ARGV)
{
	die qq|Usage: pgimport mode dbhost dbport dbname dbuser dbpass tablename datafile mask_or_delimiter firstrecord specfile fieldlist forcecase transaction_type apostrophes\n   or: pgimport c configfile\n\n|;
}

if ($ARGV[0] eq 'c')
{
	unless ((defined $ARGV[1]) and (length $ARGV[1]))
	{
		die qq|Configuration file mode requires a file name - pgimport exiting\n\n|;
	}

	#three-argument open: the file name is never interpreted as an open mode
	open(my $configfh, '<', $ARGV[1]) or die qq|File $ARGV[1] does not exist or cannot be opened ($!) - pgimport exiting\n\n|;
	@config = <$configfh>;
	chomp(@config);
	close $configfh;
}
else
{
	@config = @ARGV;
}

#normalize the settings BEFORE validating them, so that every test below
#sees the final value:  a missing setting, or one that is exactly the word
#NULL (in any case), becomes an empty string.  The match is anchored so that
#values which merely contain "null" (a file called nullable.txt, say) survive.

my $lastslot = ($#config > 14) ? $#config : 14;

foreach my $slot (0 .. $lastslot)
{
	if ((!defined $config[$slot]) or ($config[$slot] =~ m/\ANULL\z/i))
	{
		$config[$slot] = q||;
	}
}

if (length($config[7]) == 0)
{
	print STDOUT "A data file name is required.  Exiting...\n";
	exit(0);
}
elsif (!(-e $config[7]))
{
	print STDOUT "File $config[7] does not exist.  Exiting...\n";
	exit(0);
}
elsif (!(-r $config[7]))
{
	print STDOUT "You do not have permission to read file $config[7].  Exiting...\n";
	exit(0);
}

unless ($config[3])
{
	print STDOUT "A database name is required.  Exiting...\n";
	exit(0);
}

#the delimiter lives in $config[8]; $config[0] only holds the mode
if (($config[0] =~ m/d/i) and (!($config[8])))
{
	print STDOUT "Delimited files require declaration of a delimiter.  Exiting...\n";
	exit(0);
}

#start time; used for the default table name and for the closing message
@timearray = localtime(time());

unless ($config[2])
{
	$config[2] = '5432';
}

#a lone t is turned into a tab character
if ($config[8] eq 't')
{
    $config[8] = "\t";
}

#ask interactively for whichever credential was not supplied

if (!$config[4])
{
	print STDOUT "\nEnter your database username: ";
	$config[4] = <STDIN>;
	chomp($config[4]);
}

if (!$config[5])
{
	print STDOUT "\nEnter your database password: ";

	if (!$termkey)
	{
		#$termkey is false:  plain read from STDIN, so the password is
		#echoed on the terminal while it is typed
		$config[5] = <STDIN>;
	}
	else
	{
		#$termkey is true:  switch terminal echo off for the read only
		ReadMode('noecho');
		$config[5] = ReadLine(0);
		ReadMode('normal');
	}

	chomp($config[5]);
}

#attend to the listing of field names for CREATE TABLE query
#
#the sources are tried in this order:
#  (1) first record of the data file (delimited mode with firstrecord set)
#  (2) the specfile ($config[10]), one FIELDNAME or FIELDNAME,LENGTH per line
#  (3) the comma-separated fieldlist ($config[11])
#  (4) generated names FIELD0, FIELD1, FIELD2, ...
#the result is left in @fieldnames

#the two-character string \t (backslash, t) is accepted as a spelling of tab
my $namedelimiter = ((defined $config[8]) and ($config[8] eq '\t')) ? "\t" : $config[8];

if (($config[0] =~ m/d/i) and ($config[9])) #if the first line of the data file contains field names - only good for delimited files
{
	open(my $datafh, '<', $config[7]) or die qq|Data file $config[7] does not exist or cannot be opened!\n\n|;
	$fieldnamelist = <$datafh>;
	close $datafh;

	#an empty data file yields no line at all; treat that as an empty header
	$fieldnamelist = '' unless (defined $fieldnamelist);
	chomp($fieldnamelist);

	@fieldnames = quotewords($namedelimiter,0,$fieldnamelist);
}
elsif ($config[10]) #if a file containing field names is provided in @ARGV
{
	open(my $specfh, '<', $config[10]) or die qq|Specification file $config[10] does not exist or cannot be opened!\n\n|;
	chomp(@fieldnames = <$specfh>);
	close $specfh;

	foreach $fieldname(@fieldnames) #do this in case file is structured as FIELDNAME, LENGTH
	{
		#keep only the text in front of the first comma
		$fieldname = (split(",",$fieldname))[0];
	}
}
elsif ($config[11])  #if a fieldname list is provided in @ARGV
{
	@fieldnames = quotewords(",",0,$config[11]);
}
else #auto-generate field names as FIELD0, FIELD1 if field names are not provided
{
	@fieldnames = ();

	if ($config[0] =~ m/d/i) #method for delimited files
	{
		#one generated name per field found in the first record
		open(my $datafh, '<', $config[7]) or die qq|Data file $config[7] does not exist or cannot be opened!\n\n|;
		my $firstrecord = <$datafh>;
		close $datafh;

		$firstrecord = '' unless (defined $firstrecord);
		chomp($firstrecord);

		my @firstfields = quotewords($namedelimiter,0,$firstrecord);
		@fieldnames = map { "FIELD".$_ } (0 .. $#firstfields);
	}
	elsif ($config[0] =~ m/f/i) #method for fixed-width files
	{
		#only if a mask is used and there is not a fieldlist is this needed
		#if there is a specfile, the fieldnames must be in it
		#if there is a fieldlist, this isn't needed

		#what is left of the mask after removing digits and spaces is one
		#template character per field
		my $templatechars = (defined $config[8]) ? $config[8] : '';
		$templatechars =~ s/[0-9 ]//g;

		@fieldnames = map { "FIELD".$_ } (0 .. (length($templatechars) - 1));
	}
}

#a table name is required - without one, build textimport_YYYYMMDD_HHMMSS
#from the time stored in @timearray
if (!$config[6])
{
	$config[6] = sprintf(
		"textimport_%d%02d%02d_%02d%02d%02d",
		$timearray[5] + 1900,
		$timearray[4] + 1,
		$timearray[3],
		$timearray[2],
		$timearray[1],
		$timearray[0]
	);
}

$i = 0;

#clean the field names in place;  the cleaned names are used both for the
#CREATE TABLE statement and for the column list of the INSERT stub
foreach $fieldname (@fieldnames)
{
	$fieldname = ($config[12] =~ m/u/i) ? uc($fieldname)   #option to force uppercase
	           : ($config[12] =~ m/l/i) ? lc($fieldname)   #option to force lowercase
	           :                          $fieldname;

	$fieldname =~ s/'//g;                  #apostrophes are dropped outright
	$fieldname =~ s/[^A-Za-z0-9_]/_/g;     #anything else outside letters, digits and _ becomes _
}

#seqid first, then one TEXT column per field, comma-separated
$createtable = "CREATE TABLE $config[6] ("
	.join(',', 'seqid SERIAL PRIMARY KEY', map { "$_ TEXT" } @fieldnames)
	.")";

#create the stub of an INSERT query;  the value list and the closing
#parenthesis are appended for each record

$insertstub = sprintf("INSERT INTO %s (%s) VALUES(", $config[6], join(',', @fieldnames));
	
#if using fixed-width data, an unpack mask is required
#it is taken from $config[8] when given, otherwise it is assembled from the
#second comma-separated column (the width) of every line of the specfile
if ($config[0] =~ m/f/i)
{
	if ($config[8])
	{
		$unpackspec = $config[8];
	}
	elsif ($config[10])
	{
		open(my $widthfh, '<', $config[10]) or die qq|Specification file $config[10] does not exist or cannot be opened!\n\n|;
		@speclines = <$widthfh>;
		close $widthfh;
		chomp(@speclines);
		@widths = ();

		foreach my $specline (@speclines)
		{
			my $width = (quotewords(",",0,$specline))[1];
			$width =~ s/\s+//g if (defined $width);

			#a missing or non-numeric width would silently produce a
			#template that does not line up with the field names
			unless ((defined $width) and ($width =~ m/\A[0-9]+\z/))
			{
				die qq|Specification file $config[10] has no numeric field width in line "$specline"\n\n|;
			}

			push @widths,$width;
		}

		#one A<width> item per field, e.g. A10A5A20
		$unpackspec = join('', map { 'A'.$_ } @widths);
	}
	else
	{
		print STDOUT "Fixed-width files require a mask or a specification file.  Exiting...\n";
		exit(0);
	}
}

#now create the table and get the data into the database

#open the data file first, so that an unreadable file is reported before
#anything is created in the database
open(DATAFILE, '<', $config[7]) or die qq|Data file $config[7] does not exist or cannot be opened ($!)\n\n|;

#an empty host is left out of the DSN rather than written as "host="
my @dsnparts = ("dbname=$config[3]");
push @dsnparts, "host=$config[1]" if ($config[1]);
push @dsnparts, "port=$config[2]";

$dbh = DBI->connect("dbi:Pg:".join(';',@dsnparts),"$config[4]","$config[5]")
	or die qq|Could not connect to database $config[3]: $DBI::errstr\n\n|;

#transactions are always explicit:  one for the whole import when
#$config[13] is set, otherwise one for the table and one per record
$dbh->{AutoCommit}=0;

unless ($dbh->do($createtable))
{
	#never carry on into a table that could not be created
	my $reason = $dbh->errstr;
	$dbh->rollback;
	$dbh->disconnect;
	die qq|Table $config[6] could not be created: $reason\n\n|;
}

$dbh->commit unless ($config[13]);

if ($config[9]) #if the first record contains field names, discard it
{
	my $discarded = <DATAFILE>;
}

#settings may be empty strings (or undef) here, so silence only the two
#warning classes that such values trigger
no warnings qw(uninitialized numeric);

#the two-character string \t (backslash, t) is accepted as a spelling of tab
#NOTE: Text::ParseWords documents the delimiter as a regular expression, so
#characters such as | or . in $config[8] are pattern syntax, not literals
my $datadelimiter = ($config[8] eq '\t') ? "\t" : $config[8];

#an empty setting compares equal to 0 here, so it also strips apostrophes
my $stripapostrophes = ($config[14] == 0);

$dbh->{AutoCommit}=0; #every INSERT runs inside an explicit transaction

while ($dataline = <DATAFILE>)
{
	chomp($dataline);

	#skip empty lines only;  a line consisting of "0" is data
	next unless (length $dataline);

	if ($config[0] =~ m/d/i)
	{
		@dataarray = quotewords($datadelimiter,0,$dataline);
	}
	elsif ($config[0] =~ m/f/i)
	{
		@dataarray = unpack($unpackspec,$dataline);
	}

	#a line that yields no fields is sent as one empty value
	@dataarray = ('') unless (@dataarray);

	foreach $datum(@dataarray)
	{
		$datum = '' unless (defined $datum); #a trailing empty field comes back undefined
		$datum =~ s/\'//g if ($stripapostrophes);
	}

	#let the driver build the SQL literals, so apostrophes and backslashes
	#in the data are escaped the way the server expects
	$insertlist = join(',', map { $dbh->quote($_) } @dataarray);

	if ($dbh->do($insertstub.$insertlist.q|)|))
	{
		$dbh->commit unless ($config[13]); #commit work unless bulk transaction in progress
		$recordcount++; #only records that were really inserted are counted
	}
	elsif ($config[13])
	{
		#one failed INSERT dooms the bulk transaction;  stop instead of
		#sending statements that can no longer succeed
		my $reason = $dbh->errstr;
		$dbh->rollback;
		$dbh->disconnect;
		die qq|INSERT failed at line $. of $config[7] - bulk transaction rolled back, nothing imported: $reason\n\n|;
	}
	else
	{
		$dbh->rollback; #discard the failed record and carry on with the next one
	}
}

use warnings;

close DATAFILE;

if ($config[13])
{
	#end bulk transaction;  do not report success if the commit itself failed
	$dbh->commit or die qq|Bulk transaction could not be committed: |.$dbh->errstr.qq|\n\n|;
}

$dbh->disconnect;

print STDOUT "\n\nFINISHED uploading $recordcount records\nto table $config[6] in database $config[3]\n";

#the time shown is the one stored in @timearray
print STDOUT sprintf(
	"pgimport %d-%02d-%02d  %02d:%02d:%02d\n\n",
	$timearray[5] + 1900,
	$timearray[4] + 1,
	$timearray[3],
	$timearray[2],
	$timearray[1],
	$timearray[0]
);

exit(0);

=head1 NAME

pgimport - a Perl utility to upload ASCII files into PostgreSQL databases

=head1 SYNOPSIS

B<command-line method:>

B<pgimport> I<mode> I<dbhost> I<dbport> I<dbname> I<dbuser> I<dbpass> I<tablename> I<datafile> I<mask_or_delimiter> I<firstrecord> I<specfile> I<fieldlist> I<forcecase> I<transaction_type> I<apostrophes> 

or

B<configuration file method:>

B<pgimport> B<c> I<configfile>

B<All arguments are mandatory>, but some may be marked as NULL.

=head1 DESCRIPTION

B<pgimport> allows for the import of delimited or fixed-width ASCII text 
into PostgreSQL databases, with automatic table creation upon import.

I wrote this because (1) COPY is limited to the superuser (2) \copy is a crippled version of COPY (3) pgadmin3 and pgaccess have broken import/export functionality (at least on Fedora Core 1 - yes, with the updates installed via yum) (4) I am trying to extricate myself from certain proprietary software products from Redmond, Washington.  A replacement for a certain desktop database is the last piece of the puzzle - and I am only too happy to share with others.

So, in the great tradition of Open-Source, I rolled my own.

Yes, there is a B<pgexport>.

=head1 LICENSE

This software (pgimport) may be used under either the GNU General Public License, version 2 (or at your option, any later version), or the Artistic License.

No warranty or guarantee, either express or implied, exists for this software or for the use of this software.  You use this software at your own risk.

=head1 DEVELOPMENT

B<pgimport> was developed using Fedora Core 1 (updated via yum to 2004-01-26), PostgreSQL 7.3.4, and Perl 5.8.1 i386-linux-thread-multi.

=head1 OPTIONS

B<command-line method>

NOTE:  When NULL is an option, the literal string "NULL" is meant, without any quotation marks.  A "0" (zero) can also be substituted where NULL is used (again, no quotation marks).

=over 4

=item I<mode>

I<mode> is either B<d> if using delimited data, or B<f> if using fixed-width data

=item I<dbhost>

I<dbhost> is the DNS host name or IP address of the PostgreSQL server.  If B<NULL>, then B<localhost> is the default.

=item I<dbport>

I<dbport> is the connection port for the PostgreSQL server.  If B<NULL>, then B<5432> is the default.

=item I<dbname>

I<dbname> is the name of the database on the server.  No default value is available, so you MUST provide this information.

=item I<dbuser>

I<dbuser> is the username by which you intend to connect to the database server.  No default, but if NULL is passed, you will be prompted for a username from the command line.

=item I<dbpass>

I<dbpass> is the password by which you intend to connect to the database server.  No default, but if NULL is passed, you will be prompted for a password from the command line.  If Perl module Term::ReadKey is available, the password will be entered no-echo.  If the module is not available, the password will be entered in cleartext.

=item I<tablename>

I<tablename> is the name of the table in the database to which you want to import your data.  Note that the table must not exist - it will be created in the database at upload time.  If NULL is passed, the default is a timestamped name of the form I<textimport_YYYYMMDD_HHMMSS>

=item I<datafile>

I<datafile> is the name of the file containing the data you want to import.  An absolute path is not required, but is best.

=item I<mask_or_delimiter>

I<mask_or_delimiter> works differently depending on whether I<mode> is B<d> or B<f>.  

If I<mode> is B<d>, then the field delimiter is entered here (use \t for tabs).  There is NO default delimiter.

If I<mode> is B<f>, then a pack mask may be given here, or NULL passed.  (See Perl documentation for functions pack() and unpack()).  Pack masks should use only the B<A> template character for ASCII data (i.e. do not use a template character other than B<A> unless you know what you are doing).

=item I<firstrecord>

If I<mode> is B<f>, this must be B<NULL>.

If I<mode> is B<d>, pass a B<1> here to indicate that the datafile contains field names in its first record.  If it does not, pass B<NULL>.

=item I<specfile>

I<specfile> indicates the location of a layout and specification file.  The file should contain records in the form I<recordname>,I<fieldwidth> (yes, comma-separated), with each record entry terminated by a newline (so it should "read down"), although I<fieldwidth> is optional if I<mode> is B<d>. 

If no layout and specification file is used, pass B<NULL>.

=item I<fieldlist>

I<fieldlist> contains a comma-delimited list of field names.  B<NULL> may be passed if you are providing field names another way.

=item I<forcecase>

I<forcecase> can have one of three values:  B<u>, B<l>, B<p>.  

B<p> preserves the case of all values and fieldnames.  This is the default.

B<u> forces all values and fieldnames to uppercase.

B<l> forces all values and fieldnames to lowercase.

EXCEPTION:  fieldname seqid, which is the sequence field automatically generated by pgimport, is always lowercase.

=item I<transaction_type>

If I<transaction_type> is B<1>, a bulk transaction will be used.  This is faster, but if one INSERT fails, the entire transaction will be rolled back, and no records will get into the database.  This is the default.

If I<transaction_type> is B<0>, a transaction will be initiated for each INSERT (each record).  This is computationally more intensive and takes longer, but only records that fail insert will not get into the database.  This may be a better choice for "dirty" recordsets.

=item I<apostrophes>

I<apostrophes> controls the insert of single-quote and apostrophe characters.  Attempting to insert them unamended will guarantee failure of the transaction.

If I<apostrophes> is 1, single-quotes and apostrophes will be escaped, preserving them in the database.  This is the default.

If I<apostrophes> is 0, single-quotes and apostrophes are eliminated from values before insertion, and are therefore lost.

=back

B<configuration file method>

=over 4

In the configuration file method, I<mode> is B<c>.  Instead of reading options from the command line, they are read from I<configfile>.  The entries in I<configfile> should appear in the same order, using the same syntax, as for the command-line method, except that a newline should terminate each argument (i.e. the arguments should form a list "reading down").

=back

=head1 INTERNALS

Array @config holds the information obtained from the command line or from the configuration file.  The elements of the array are used as follows.  Please see the appropriate entries in section OPTIONS.

$config[0]: I<mode>

$config[1]: I<dbhost>

$config[2]: I<dbport>

$config[3]: I<dbname>

$config[4]: I<dbuser>

$config[5]: I<dbpass>

$config[6]: I<tablename>

$config[7]: I<datafile>

$config[8]: I<mask_or_delimiter>

$config[9]: I<firstrecord>

$config[10]: I<specfile>

$config[11]: I<fieldlist>

$config[12]: I<forcecase>

$config[13]: I<transaction_type>

$config[14]: I<apostrophes>

=head1 FILES

=over 4

Please see section OPTIONS for required fields, specifically I<datafile> and I<specfile>.

=back

=head1 DIAGNOSTICS AND GOTCHAS

=over 4

I am too lazy to go over every error message here.  Besides, the ones you will get from STDOUT are descriptive enough.

However, some words of note and caution are in order.

1.  Field names must match /[a-zA-Z0-9_]/.  Any character that does not match this pattern will be replaced with an underscore.

2.  Field names are determined in the following order:  (1) I<firstrecord> (only for delimited files)  (2) I<specfile>  (3) I<fieldlist>  (4) if no fields are indicated, fieldnames will be automatically generated as FIELD0, FIELD1, FIELD2, etc.

3.  The field B<seqid> is always the first field in the newly-created table.  It is a SERIAL field, and is always the primary key.

4.  All fields in the newly-created table are of type TEXT.  The point of the program is to get the data into the database, not to do data scrubbing.

5.  I<tablename> must NOT exist in the database.

6.  For those of you on Unixlike systems, you may want to run dos2unix on text files received from WinDOS land.

=back

=head1 REQUIRES

Perl 5.004 or higher, DBI, DBD::Pg, Text::ParseWords

Term::ReadKey is required for no-echo password entry from the command line when prompted.  If this module is unavailable, the program will still run, but no-echo entry will not be available (you will have to enter your password in clear text from the command line).  See previous entry for I<dbpass>.

=head1 SEE ALSO

pack(), unpack(), DBI, DBD::Pg, Text::ParseWords, Term::ReadKey

=head1 THANKS

Many thanks to the PostgreSQL development team for a database I can live with (meaning inlined functions using PL/Perl), to Larry Wall and his little helpers for the best utility language in the world, and to the contributors to Fedora Core 1.

=head1 AUTHOR

Wayne Matthew Syvinski, matthew@techcelsior.com

=cut


