On 22 Aug 2003, Jeremy Newman wrote: [...]
- No <html><head><body> tags. Just the content, i.e., everything that
would be between the <body> tags.
I have a Perl script that does that part. Should be easy to extend to also extract the title... Here goes as a starting point. Maybe I'll work a bit more on it tomorrow but if anyone feels like hacking on it, feel free!
#!/usr/bin/perl
#
# Reduce an HTML file to the content found between its <body ...> and
# </body> tags, rewriting the file in place.
#
# Usage: <this script> FILE
#
# FILE is first copied to "FILE.bak" (unless that backup already exists);
# the backup is then used as the input and FILE is overwritten with the
# extracted content.  Progress/debug messages go to STDOUT, errors go to
# STDERR.  Exit status is 0 on success and 1 on any failure.

use strict;
use warnings;
use File::Copy;

my $filename = $ARGV[0];
if (!defined $filename || $filename eq '') {
    print STDERR "error: no input file given\n";
    exit 1;
}
print " $filename\n";

#FIXME:assuming that because there is a .bak file, this is what we want is
#probably flawed. Or is it???
my $backup = "$filename.bak";
if (! -e $backup) {
    if (!copy($filename, $backup)) {
        print STDERR "error: unable to make a backup of $filename:\n";
        print STDERR " $!\n";
        # We are at file scope, so "return" would be a fatal error here.
        exit 1;
    }
}

my $in;
if (!open($in, '<', $backup)) {
    print STDERR "error: unable to open $backup for reading:\n";
    print STDERR " $!\n";
    exit 1;
}

# Phase 1: skip everything up to and including the opening <body ...> tag.
# On exit from the loop, $line holds whatever followed the tag on its line.
my $found_body = 0;
my $line;
while (defined($line = <$in>)) {
    if ($line =~ s/^.*?<body\b[^>]*>//i) {
        # Whole tag on one line; anything before it on the same line
        # (e.g. "</head>") is header material and is dropped as well.
        print "matched <body>: $line";
        $found_body = 1;
        last;
    }
    elsif ($line =~ s/<body\b[^>]*$//i) {
        # The tag's attributes spill onto following lines: keep reading
        # until the line containing the closing '>'.
        print "matched <body: $line";
        while (defined($line = <$in>)) {
            print "looking for > $line";
            if ($line =~ s/^[^>]*>//) {
                $found_body = 1;
                last;
            }
        }
        last;
    }
}

if (!$found_body) {
    # The output file has not been opened yet, so $filename is untouched.
    print STDERR "error: no <body> tag found in $backup\n";
    close $in;
    exit 1;
}

# Only open (and thereby truncate) the target once we know there is
# something to write.
my $out;
if (!open($out, '>', $filename)) {
    print STDERR "error: unable to open $filename for writing:\n";
    print STDERR " $!\n";
    close $in;
    exit 1;
}

# Phase 2: copy lines until the closing </body tag.  The first line (the
# remainder after <body>) is checked too, so one-line documents work.
while (defined $line) {
    # '.' does not match "\n", so the line terminator is preserved while
    # the closing tag and anything after it on the line is removed.
    my $reached_end = ($line =~ s{</body\b.*}{}i);
    print {$out} $line;
    last if $reached_end;
    $line = <$in>;
}

close $in;
# Buffered write errors only surface at close time, so check it.
if (!close($out)) {
    print STDERR "error: unable to finish writing $filename:\n";
    print STDERR " $!\n";
    exit 1;
}

exit 0;