#------------------------------------------------------------------------------ # File: PostScript.pm # # Description: Read PostScript meta information # # Revisions: 07/08/2005 - P. Harvey Created # # References: 1) http://partners.adobe.com/public/developer/en/ps/5002.EPSF_Spec.pdf # 2) http://partners.adobe.com/public/developer/en/ps/5001.DSC_Spec.pdf # 3) http://partners.adobe.com/public/developer/en/illustrator/sdk/AI7FileFormat.pdf #------------------------------------------------------------------------------ package Image::ExifTool::PostScript; use strict; use vars qw($VERSION $AUTOLOAD); use Image::ExifTool qw(:DataAccess :Utils); $VERSION = '1.23'; sub WritePS($$); sub ProcessPS($$;$); # PostScript tag table %Image::ExifTool::PostScript::Main = ( PROCESS_PROC => \&ProcessPS, WRITE_PROC => \&WritePS, PREFERRED => 1, # always add these tags when writing GROUPS => { 2 => 'Image' }, # Note: Make all of these tags priority 0 since the first one found at # the start of the file should take priority (in case multiples exist) Author => { Priority => 0, Groups => { 2 => 'Author' }, Writable => 'string' }, BoundingBox => { Priority => 0 }, Copyright => { Priority => 0, Writable => 'string' }, #2 CreationDate => { Name => 'CreateDate', Priority => 0, Groups => { 2 => 'Time' }, Writable => 'string', }, Creator => { Priority => 0, Writable => 'string' }, ImageData => { Priority => 0 }, For => { Priority => 0, Writable => 'string', Notes => 'for whom the document was prepared'}, Keywords => { Priority => 0, Writable => 'string' }, ModDate => { Name => 'ModifyDate', Priority => 0, Groups => { 2 => 'Time' }, Writable => 'string', }, Pages => { Priority => 0 }, Routing => { Priority => 0, Writable => 'string' }, #2 Subject => { Priority => 0, Writable => 'string' }, Title => { Priority => 0, Writable => 'string' }, Version => { Priority => 0, Writable => 'string' }, #2 # these subdirectories for documentation only BeginPhotoshop => { Name => 'PhotoshopData', SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main', }, }, BeginICCProfile => { Name => 'ICC_Profile', SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main', }, }, begin_xml_packet => { Name => 'XMP', SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main', }, }, TIFFPreview => { Binary => 1, Notes => q{ not a real tag ID, but used to represent the TIFF preview extracted from DOS EPS images }, }, BeginDocument => { Name => 'EmbeddedFile', SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main', }, Notes => 'extracted with ExtractEmbedded option', }, EmbeddedFileName => { Groups => { 3 => 'Doc#' }, # (for GetAllGroups()) Notes => q{ not a real tag ID, but the file name from a BeginDocument statement. Extracted with document metadata when ExtractEmbedded option is used }, }, ); # composite tags %Image::ExifTool::PostScript::Composite = ( GROUPS => { 2 => 'Image' }, # BoundingBox is in points, not pixels, # but use it anyway if ImageData is not available ImageWidth => { Desire => { 0 => 'Main:PostScript:ImageData', 1 => 'PostScript:BoundingBox', }, ValueConv => 'Image::ExifTool::PostScript::ImageSize(\@val, 0)', }, ImageHeight => { Desire => { 0 => 'Main:PostScript:ImageData', 1 => 'PostScript:BoundingBox', }, ValueConv => 'Image::ExifTool::PostScript::ImageSize(\@val, 1)', }, ); # add our composite tags Image::ExifTool::AddCompositeTags('Image::ExifTool::PostScript'); #------------------------------------------------------------------------------ # AutoLoad our writer routines when necessary # sub AUTOLOAD { return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_); } #------------------------------------------------------------------------------ # Get image width or height # Inputs: 0) value list ref (ImageData, BoundingBox), 1) true to get height sub ImageSize($$) { my ($vals, $getHeight) = @_; my ($w, $h); if ($$vals[0] and $$vals[0] =~ /^(\d+) (\d+)/) { ($w, $h) = ($1, $2); } elsif ($$vals[1] and $$vals[1] =~ /^(\d+) (\d+) (\d+) (\d+)/) { ($w, $h) = ($3 - $1, $4 - $2); } return $getHeight ? $h : $w; } #------------------------------------------------------------------------------ # Set PostScript format error warning # Inputs: 0) ExifTool object reference, 1) error string # Returns: 1 sub PSErr($$) { my ($exifTool, $str) = @_; # set file type if not done already $exifTool->SetFileType('PS') unless $exifTool->GetValue('FileType'); $exifTool->Warn("PostScript format error ($str)"); return 1; } #------------------------------------------------------------------------------ # set $/ according to the current file # Inputs: 0) RAF reference # Returns: Original separator or undefined if on error sub SetInputRecordSeparator($) { my $raf = shift; my $oldsep = $/; my $pos = $raf->Tell(); # save current position my $data; $raf->Read($data,256) or return undef; my ($a, $d) = (999,999); $a = pos($data), pos($data) = 0 if $data =~ /\x0a/g; $d = pos($data) if $data =~ /\x0d/g; my $diff = $a - $d; if ($diff eq 1) { $/ = "\x0d\x0a"; } elsif ($diff eq -1) { $/ = "\x0a\x0d"; } elsif ($diff > 0) { $/ = "\x0d"; } elsif ($diff < 0) { $/ = "\x0a"; } else { return undef; # error } $raf->Seek($pos, 0); # restore original position return $oldsep; } #------------------------------------------------------------------------------ # Decode comment from PostScript file # Inputs: 0) comment string, 1) RAF ref, 2) reference to lines array # 3) optional data reference for extra lines read from file # Returns: Decoded comment string (may be an array reference) # - handles multi-line comments and escape sequences sub DecodeComment($$$;$) { my ($val, $raf, $lines, $dataPt) = @_; $val =~ s/\x0d*\x0a*$//; # remove trailing CR, LF or CR/LF # check for continuation comments for (;;) { unless (@$lines) { my $buff; $raf->ReadLine($buff) or last; my $altnl = $/ eq "\x0d" ? "\x0a" : "\x0d"; if ($buff =~ /$altnl/) { # split into separate lines @$lines = split /$altnl/, $buff, -1; # handle case of DOS newline data inside file using Unix newlines @$lines = ( $$lines[0] . $$lines[1] ) if @$lines == 2 and $$lines[1] eq $/; } else { push @$lines, $buff; } } last unless $$lines[0] =~ /^%%\+/; # is the next line a continuation? $$dataPt .= $$lines[0] if $dataPt; # add to data if necessary $$lines[0] =~ s/\x0d*\x0a*$//; # remove trailing CR, LF or CR/LF $val .= substr(shift(@$lines), 3); # add to value (without leading "%%+") } my @vals; # handle bracketed string values if ($val =~ s/^\((.*)\)$/$1/) { # remove brackets if necessary # split into an array of strings if necessary my $nesting = 1; while ($val =~ /(\(|\))/g) { my $bra = $1; my $pos = pos($val) - 2; my $backslashes = 0; while ($pos and substr($val, $pos, 1) eq '\\') { --$pos; ++$backslashes; } next if $backslashes & 0x01; # escaped if odd number if ($bra eq '(') { ++$nesting; } else { --$nesting; unless ($nesting) { push @vals, substr($val, 0, pos($val)-1); $val = substr($val, pos($val)); ++$nesting if $val =~ s/\s*\(//; } } } push @vals, $val; foreach $val (@vals) { # decode escape sequences in bracketed strings # (similar to code in PDF.pm, but without line continuation) while ($val =~ /\\(.)/sg) { my $n = pos($val) - 2; my $c = $1; my $r; if ($c =~ /[0-7]/) { # get up to 2 more octal digits $c .= $1 if $val =~ /\G([0-7]{1,2})/g; # convert octal escape code $r = chr(oct($c) & 0xff); } else { # convert escaped characters ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/; } substr($val, $n, length($c)+1) = $r; # continue search after this character pos($val) = $n + length($r); } } $val = @vals > 1 ? \@vals : $vals[0]; } return $val; } #------------------------------------------------------------------------------ # Extract information from EPS, PS or AI file # Inputs: 0) ExifTool object reference, 1) dirInfo reference, 2) optional tag table ref # Returns: 1 if this was a valid PostScript file sub ProcessPS($$;$) { my ($exifTool, $dirInfo, $tagTablePtr) = @_; my $raf = $$dirInfo{RAF}; my $embedded = $exifTool->Options('ExtractEmbedded'); my ($data, $dos, $endDoc); # allow read from data $raf = new File::RandomAccess($$dirInfo{DataPt}) unless $raf; # # determine if this is a postscript file # $raf->Read($data, 4) == 4 or return 0; # accept either ASCII or DOS binary postscript file format return 0 unless $data =~ /^(%!PS|%!Ad|\xc5\xd0\xd3\xc6)/; if ($data =~ /^%!Ad/) { # I've seen PS files start with "%!Adobe-PS"... return 0 unless $raf->Read($data, 6) == 6 and $data eq "obe-PS"; } elsif ($data =~ /^\xc5\xd0\xd3\xc6/) { # process DOS binary file header # - save DOS header then seek ahead and check PS header $raf->Read($dos, 26) == 26 or return 0; SetByteOrder('II'); unless ($raf->Seek(Get32u(\$dos, 0), 0) and $raf->Read($data, 4) == 4 and $data eq '%!PS') { return PSErr($exifTool, 'invalid header'); } } # # set the newline type based on the first newline found in the file # my $oldsep = SetInputRecordSeparator($raf); $oldsep or return PSErr($exifTool, 'invalid PS data'); # set file type (PostScript or EPS) $raf->ReadLine($data) or return 0; $exifTool->SetFileType($data =~ /EPSF/ ? 'EPS' : 'PS') unless $exifTool->{VALUE}->{FileType}; # # extract TIFF information from DOS header # $tagTablePtr or $tagTablePtr = GetTagTable('Image::ExifTool::PostScript::Main'); if ($dos) { my $base = Get32u(\$dos, 16); if ($base) { my $pos = $raf->Tell(); # extract the TIFF preview my $len = Get32u(\$dos, 20); my $val = $exifTool->ExtractBinary($base, $len, 'TIFFPreview'); if (defined $val and $val =~ /^(MM\0\x2a|II\x2a\0|Binary)/) { $exifTool->HandleTag($tagTablePtr, 'TIFFPreview', $val); } else { $exifTool->Warn('Bad TIFF preview image'); } # extract information from TIFF in DOS header # (set Parent to '' to avoid setting FileType tag again) my %dirInfo = ( Parent => '', RAF => $raf, Base => $base, ); $exifTool->ProcessTIFF(\%dirInfo) or $exifTool->Warn('Bad embedded TIFF'); # position file pointer to extract PS information $raf->Seek($pos, 0); } } # # parse the postscript # my ($buff, $mode, $beginToken, $endToken, $docNum, $subDocNum); my (@lines, $altnl); if ($/ eq "\x0d") { $altnl = "\x0a"; } else { $/ = "\x0a"; # end on any LF (even if DOS CR+LF) $altnl = "\x0d"; } for (;;) { if (@lines) { $data = shift @lines; } else { $raf->ReadLine($data) or last; # check for alternate newlines as efficiently as possible if ($data =~ /$altnl/) { # split into separate lines @lines = split /$altnl/, $data, -1; $data = shift @lines; if (@lines == 1 and $lines[0] eq $/) { # handle case of DOS newline data inside file using Unix newlines $data .= $lines[0]; undef @lines; } } } if ($mode) { if (not $endToken) { $buff .= $data; next unless $data =~ m{<\?xpacket end=.(w|r).\?>($/|$)}; } elsif ($data !~ /^$endToken/i) { if ($mode eq 'XMP') { $buff .= $data; } elsif ($mode eq 'Document') { # ignore embedded documents, but keep track of nesting level $docNum .= '-1' if $data =~ /^$beginToken/; } else { # data is ASCII-hex encoded $data =~ tr/0-9A-Fa-f//dc; # remove all but hex characters $buff .= pack('H*', $data); # translate from hex } next; } elsif ($mode eq 'Document') { $docNum =~ s/-?\d+$//; # decrement document nesting level # done with Document mode if we are back at the top level undef $mode unless $docNum; next; } } elsif ($endDoc and $data =~ /^$endDoc/i) { $docNum =~ s/-?(\d+)$//; # decrement nesting level $subDocNum = $1; # remember our last sub-document number $$exifTool{DOC_NUM} = $docNum; undef $endDoc unless $docNum; # done with document if top level next; } elsif ($data =~ /^(%{1,2})(Begin)(_xml_packet|Photoshop|ICCProfile|Document|Binary)/i) { # the beginning of a data block my %modeLookup = ( _xml_packet => 'XMP', photoshop => 'Photoshop', iccprofile => 'ICC_Profile', document => 'Document', binary => undef, # (we will try to skip this) ); $mode = $modeLookup{lc $3}; unless ($mode) { if (not @lines and $data =~ /^%{1,2}BeginBinary:\s*(\d+)/i) { $raf->Seek($1, 1) or last; # skip binary data } next; } $buff = ''; $beginToken = $1 . $2 . $3; $endToken = $1 . ($2 eq 'begin' ? 'end' : 'End') . $3; if ($mode eq 'Document') { # this is either the 1st sub-document or Nth document if ($docNum) { # increase nesting level $docNum .= '-' . (++$subDocNum); } else { # this is the Nth document $$exifTool{DOC_COUNT} = ($$exifTool{DOC_COUNT} || 0) + 1; $docNum = $$exifTool{DOC_COUNT}; } $subDocNum = 0; # new level, so reset subDocNum next unless $embedded; # skip over this document # set document number for family 4-7 group names $$exifTool{DOC_NUM} = $docNum; $$exifTool{LIST_TAGS} = { }; # don't build lists across different documents $exifTool->{PROCESSED} = { }; # re-initialize processed directory lookup too $endDoc = $endToken; # parse to EndDocument token # reset mode to allow parsing into sub-directories undef $endToken; undef $mode; # save document name if available if ($data =~ /^$beginToken:\s+([^\n\r]+)/i) { my $docName = $1; # remove brackets if necessary $docName = $1 if $docName =~ /^\((.*)\)$/; $exifTool->HandleTag($tagTablePtr, 'EmbeddedFileName', $docName); } } next; } elsif ($data =~ /^<\?xpacket begin=.{7,13}W5M0MpCehiHzreSzNTczkc9d/) { # pick up any stray XMP data $mode = 'XMP'; $buff = $data; undef $endToken; # no end token (just look for xpacket end) # XMP could be contained in a single line (if newlines are different) next unless $data =~ m{<\?xpacket end=.(w|r).\?>($/|$)}; } elsif ($data =~ /^%%?(\w+): ?(.*)/s and $$tagTablePtr{$1}) { my ($tag, $val) = ($1, $2); # only allow 'ImageData' to have single leading '%' next unless $data =~ /^%%/ or $1 eq 'ImageData'; # decode comment string (reading continuation lines if necessary) $val = DecodeComment($val, $raf, \@lines); $exifTool->HandleTag($tagTablePtr, $tag, $val); next; } elsif ($embedded and $data =~ /^%AI12_CompressedData/) { # the rest of the file is compressed unless (eval 'require Compress::Zlib') { $exifTool->Warn('Install Compress::Zlib to extract compressed embedded data'); last; } # seek back to find the start of the compressed data in the file my $tlen = length($data) + @lines; $tlen += length $_ foreach @lines; my $backTo = $raf->Tell() - $tlen - 64; $backTo = 0 if $backTo < 0; last unless $raf->Seek($backTo, 0) and $raf->Read($data, 2048); last unless $data =~ s/.*?%AI12_CompressedData//; my $inflate = Compress::Zlib::inflateInit(); $inflate or $exifTool->Warn('Error initializing inflate'), last; # generate a PS-like file in memory from the compressed data my $verbose = $exifTool->Options('Verbose'); my $out = $exifTool->Options('TextOut'); if ($verbose > 1) { $exifTool->VerboseDir('AI12_CompressedData (first 4kB)'); $exifTool->VerboseDump(\$data); } # remove header if it exists (Windows AI files only) $data =~ s/^.{0,256}EndData[\x0d\x0a]+//s; my $val; for (;;) { my ($v2, $stat) = $inflate->inflate($data); $stat == Compress::Zlib::Z_STREAM_END() and $val .= $v2, last; $stat != Compress::Zlib::Z_OK() and undef($val), last; if (defined $val) { $val .= $v2; } elsif ($v2 =~ /^%!PS/) { $val = $v2; } else { # add postscript header (for file recognition) if it doesn't exist $val = "%!PS-Adobe-3.0$/" . $v2; } $raf->Read($data, 65536) or last; } defined $val or $exifTool->Warn('Error inflating AI compressed data'), last; if ($verbose > 1) { $exifTool->VerboseDir('Uncompressed AI12 Data'); $exifTool->VerboseDump(\$val); } # extract information from embedded images in the uncompressed data $val = # add PS header in case it needs one ProcessPS($exifTool, { DataPt => \$val }); last; } else { next; } # extract information from buffered data my %dirInfo = ( DataPt => \$buff, DataLen => length $buff, DirStart => 0, DirLen => length $buff, Parent => 'PostScript', ); my $subTablePtr = GetTagTable("Image::ExifTool::${mode}::Main"); unless ($exifTool->ProcessDirectory(\%dirInfo, $subTablePtr)) { $exifTool->Warn("Error processing $mode information in PostScript file"); } undef $buff; undef $mode; } $/ = $oldsep; # restore original separator $mode = 'Document' if $endDoc and not $mode; $mode and PSErr($exifTool, "unterminated $mode data"); return 1; } #------------------------------------------------------------------------------ # Extract information from EPS file # Inputs: 0) ExifTool object reference, 1) dirInfo reference # Returns: 1 if this was a valid PostScript file sub ProcessEPS($$) { return ProcessPS($_[0],$_[1]); } 1; # end __END__ =head1 NAME Image::ExifTool::PostScript - Read PostScript meta information =head1 SYNOPSIS This module is loaded automatically by Image::ExifTool when required. =head1 DESCRIPTION This code reads meta information from EPS (Encapsulated PostScript), PS (PostScript) and AI (Adobe Illustrator) files. =head1 AUTHOR Copyright 2003-2009, Phil Harvey (phil at owl.phy.queensu.ca) This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 REFERENCES =over 4 =item L =item L =item L =back =head1 SEE ALSO L, L =cut