package BibTeX::Parser; { $BibTeX::Parser::VERSION = '1.95'; } # ABSTRACT: A pure perl BibTeX parser use warnings; use strict; require BibTeX::Parser::Entry; # mutual dependency, so use instead of require my $re_namechar = qr/[a-zA-Z0-9\!\$\&\*\+\-\.\/\:\;\<\>\?\[\]\^\_\`\|\']/o; my $re_name = qr/$re_namechar+/o; sub new { my ( $class, $fh, $opts ) = @_; return bless { fh => $fh, opts => $opts || {}, strings => { jan => "January", feb => "February", mar => "March", apr => "April", may => "May", jun => "June", jul => "July", aug => "August", sep => "September", oct => "October", nov => "November", dec => "December", "j-tugboat" => "TUGboat", # missing definition is irritating }, line => -1, buffer => "", entries => {} }, $class; } sub read { my $self = shift; if (!exists $self->{opts}->{errorlevel}) { $self->{opts}->{errorlevel} = 'warn'; } while (my $entry=$self->next) { if ($entry->parse_ok) { $self->{entries}->{$entry->key()} = $entry; } else { if ($self->{opts}->{errorlevel} eq 'warn') { warn "BibTeX Parser: Skipping entry in line $." } elsif ($self->{opts}->{errorlevel} eq 'error') { exit("BibTeX Parser: Error in line $.") } } } } sub n { my $self = shift; return(scalar keys %{$self->{entries}}); } sub entrykeys { my $self = shift; my @result = keys %{$self->{entries}}; return(\@result); } sub has { my $self = shift; my $key = shift; return(exists($self->{entries}->{$key})); } sub entry { my $self = shift; my $key = shift; return($self->{entries}->{$key}); } sub _slurp_close_bracket; sub _parse_next { my $self = shift; while (1) { # loop until regular entry is finished return 0 if $self->{fh}->eof; local $_ = $self->{buffer}; until (/@/m) { my $line = $self->{fh}->getline; return 0 unless defined $line; $line =~ s/^%.*$//; $_ .= $line; } my $current_entry = new BibTeX::Parser::Entry; if (/@($re_name)/cgo) { my $type = uc $1; $current_entry->type( $type ); my $start_pos = pos($_) - length($type) - 1; # read rest of entry (matches braces) my $bracelevel = 0; $bracelevel += tr/\{/\{/; #count braces $bracelevel -= tr/\}/\}/; while ( $bracelevel != 0 ) { my $position = pos($_); my $line = $self->{fh}->getline; last unless defined $line; $bracelevel = $bracelevel + ( $line =~ tr/\{/\{/ ) - ( $line =~ tr/\}/\}/ ); $_ .= $line; pos($_) = $position; } # Remember text before the entry my $pre = substr($_, 0, $start_pos-1); if ($start_pos == 0) { $pre = ''; } $current_entry->pre($pre); # Remember raw bibtex code my $raw = substr($_, $start_pos); $raw =~ s/^\s+//; $raw =~ s/\s+$//; $current_entry->raw_bibtex($raw); my $pos = pos $_; tr/\n/ /; pos($_) = $pos; if ( $type eq "STRING" ) { if (/\G\{\s*($re_name)\s*=\s*/cgo) { my $key = lc($1); my $value = _parse_string( $self->{strings}, exists $self->{opts}->{"no-warn-ack"} ); # If redefining to the same value, don't worry. # If redefining j-tugboat (predefined above), don't worry, # people use \TUB vs. "TUGboat" arbitrarily. my $old_value = $self->{strings}->{$key}; if ($key ne "j-tugboat" && defined $old_value && $old_value ne $value) { warn("Redefining string $key ", "(oldvalue=$old_value, newvalue=$value"); } $self->{strings}->{$key} = $value; /\G[\s\n]*\}/cg; } else { $current_entry->error("Malformed string! ($raw)"); return $current_entry; } } elsif ( $type eq "COMMENT" or $type eq "PREAMBLE" ) { /\G\{./cgo; _slurp_close_bracket; } else { # normal entry $current_entry->parse_ok(1); # parse key if (/\G\s*\{(?:\s*($re_name)\s*,[\s\n]*|\s+\r?\s*)/cgo) { $current_entry->key($1); # fields while (/\G[\s\n]*($re_name)[\s\n]*=[\s\n]*/cgo) { $current_entry->field( $1 => _parse_string( $self->{strings}, exists $self->{opts}->{"no-warn-ack"} ) ); my $idx = index( $_, ',', pos($_) ); pos($_) = $idx + 1 if $idx > 0; } return $current_entry; } else { $current_entry->error("Malformed entry (key contains invalid characters) at " . substr($_, pos($_) || 0, 20) . ", ignoring"); _slurp_close_bracket; return $current_entry; } } $self->{buffer} = substr $_, pos($_); } else { $current_entry->error("Did not find type at " . substr($_, pos($_) || 0, 20)); return $current_entry; } } } sub next { my $self = shift; return $self->_parse_next; } # slurp everything till the next closing brace. Handles # nested brackets sub _slurp_close_bracket { my $bracelevel = 0; BRACE: { /\G[^\}]*\{/cg && do { $bracelevel++; redo BRACE }; /\G[^\{]*\}/cg && do { if ( $bracelevel > 0 ) { $bracelevel--; redo BRACE; } else { return; } } } } # parse bibtex string in $_ and return. A BibTeX string is either enclosed # in double quotes '""' or matching braces '{}'. The braced form may contain # nested braces. # # Second argument NO_WARN_ACK says whether to emit the warning # "Using undefined string" if the name of the undefined string starts # with "ack-". The default is to warn. The TUGboat config file # ltx2crossrefxml-tugboat.cfg sets this. # # It is an unfortunate fact that people routinely copy bib entries # without copying the "ack-nhfb" or other "ack-..." @string definitions # in Nelson Beebe's bibliography files, resulting in this warning. It is # too irritating to have to define them (they are never used), and also # too irritating to have to see the many warnings on every run. # # Similarly for j-TUGboat, which we predefine in the new() fn above. # sub _parse_string { my ($strings_ref, $no_warn_ack) = @_; $no_warn_ack ||= 0; my $value = ""; PART: { if (/\G(\d+)/cg) { $value .= $1; } elsif (/\G($re_name)/cgo) { my $key = lc($1); if (! defined $strings_ref->{lc($1)}) { #debug_hash("looking up $key for $1", $strings_ref); warn("Using undefined string $1 (", lc($1), ") in: $_") unless $no_warn_ack && $1 =~ /^ack-/; } $value .= $strings_ref->{$1} || ""; } elsif (/\G"(([^"\\]*(\\.)*[^\\"]*)*)"/cgs) { # quoted string with embedded escapes $value .= $1; } else { my $part = _extract_bracketed( $_ ); $value .= substr $part, 1, length($part) - 2; # strip quotes } if (/\G\s*#\s*/cg) { # string concatenation by # redo PART; } } $value =~ s/[\s\n]+/ /g; return $value; } sub _extract_bracketed { for($_[0]) # alias to $_ { /\G\s+/cg; my $start = pos($_); my $depth = 0; while(1) { /\G\\./cg && next; /\G\{/cg && (++$depth, next); /\G\}/cg && (--$depth > 0 ? next : last); /\G([^\\\{\}]+)/cg && next; last; # end of string } return substr($_, $start, pos($_)-$start); } } # Split the $string using $pattern as a delimiter with # each part having balanced braces (so "{$pattern}" # does NOT split). # Return empty list if unmatched braces sub _split_braced_string { my $string = shift; my $pattern = shift; my @tokens; return () if $string eq ''; my $buffer; while (!defined pos $string || pos $string < length $string) { if ( $string =~ /\G(.*?)(\{|$pattern)/cgi ) { my $match = $1; if ( $2 =~ /$pattern/i ) { $buffer .= $match; push @tokens, $buffer; $buffer = ""; } elsif ( $2 =~ /\{/ ) { $buffer .= $match . "{"; my $numbraces=1; while ($numbraces !=0 && pos $string < length $string) { my $symbol = substr($string, pos $string, 1); $buffer .= $symbol; if ($symbol eq '{') { $numbraces ++; } elsif ($symbol eq '}') { $numbraces --; } pos($string) ++; } if ($numbraces != 0) { return (); } } else { $buffer .= $match; } } else { $buffer .= substr $string, (pos $string || 0); last; } } push @tokens, $buffer if $buffer; return @tokens; } # sub debug_hash { my ($label) = shift; my (%hash) = (ref $_[0] && $_[0] =~ /.*HASH.*/) ? %{$_[0]} : @_; my $str = "$label: {"; my @items = (); for my $key (sort keys %hash) { my $val = $hash{$key}; $key =~ s/\n/\\n/g; $val =~ s/\n/\\n/g; push (@items, "$key:$val"); } $str .= join (",", @items); $str .= "}"; warn ($str); } # sub debug_list { my ($label) = shift; my (@list) = (ref $_[0] && $_[0] =~ /.*ARRAY.*/) ? @{$_[0]} : @_; my $str = "$label [" . join (",", @list) . "]"; warn $str; } # Return string representation of call stack for debugging. # sub backtrace { my $ret = ""; my ($line, $subr); my $stackframe = 1; # skip ourselves while ((undef,undef,$line,$subr) = caller ($stackframe)) { $ret .= " -> $subr.$line"; $stackframe++; } return $ret; } 1; # End of BibTeX::Parser __END__ =pod =head1 NAME BibTeX::Parser - A pure perl BibTeX parser =head1 SYNOPSIS Parses BibTeX files. use BibTeX::Parser; use IO::File; my $fh = IO::File->new("filename"); # Create parser object ... my $parser = BibTeX::Parser->new($fh, $opts); # ... and either iterate over entries while (my $entry = $parser->next ) { if ($entry->parse_ok) { my $type = $entry->type; my $title = $entry->field("title"); my @authors = $entry->author; # or: my @editors = $entry->editor; foreach my $author (@authors) { print $author->first . " " . $author->von . " " . $author->last . ", " . $author->jr; } } else { warn "Error parsing file: " . $entry->error; } } # ... or read all entries at once $parser->read(); my $num_entries = $parser->n(); my $entrykeys = $parser->entrykeys(); foreach my $key (@{$entrykeys}) { my $entry = $parser->entry($key); ... } if ($parser->has{'thekey') { $theentry = $parser->entry('thekey'); } =head1 DESCRIPTION There are two interfaces for BiBTeX::Parser: the serial one and the caching one. The serial interface can be used for very large files. It reads entries one by one, and outputs them using $parser->next() function. In other cases you might be better off with the caching interface. It reads all the entries at once, and can output the list of keys or the entry with the given key. =head1 FUNCTIONS =head2 new Creates new parser object. Parameters: * fh: A filehandle * opts: a refernce to the hash of options. Among other options is 'errorlevel', used in the caching interface when processing a non-parseable entry. Can be 'warn' (the default), 'erorr', or 'ignore'. Note that in serial interface the error processing is the responsibility of the user. =head2 next Returns the next parsed entry or undef. Cannot be combined with the L interface described below. =head2 read Read all the entries from the filehandle in the internal cache. =head2 n Returns the number of entries after L function has been called =head2 entrykeys Returns the array of keys after L function has been called =head2 entry Returns an entry with the given key after L function has been called Parameters: * key: the key =head2 has Returns true if the cached file has the given entry Parameters: * key: the key =head1 NOTES The fields C and C are canonicalized, see L. =head1 SEE ALSO =over 4 =item L =item L =back =head1 VERSION version 1.95 =head1 AUTHOR Gerhard Gossen and Boris Veytsman and Karl Berry =head1 COPYRIGHT AND LICENSE Copyright 2013-2026 Gerhard Gossen, Boris Veytsman, Karl Berry This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut