1 ### File: LinkWalker.pm
\r
2 ### Author: Lennart Borgman
\r
3 ### All rights reserved
\r
5 ##########################################################
\r
7 ##########################################################
\r
8 package LWP::WalkerUA;
\r
9 require LWP::UserAgent;
\r
10 @ISA = qw(LWP::UserAgent);
\r
12 ### Mirror to another file (why???)
\r
15 my($self, $url, $file, $mirr_tmp) = @_;
\r
16 die "no mirr_tmp" unless defined $mirr_tmp;
\r
18 LWP::Debug::trace('()');
\r
19 my $request = new HTTP::Request('GET', $url);
\r
22 my($mtime) = (stat($file))[9];
\r
24 $request->header('If-Modified-Since' =>
\r
25 HTTP::Date::time2str($mtime));
\r
28 my $tmpfile = "$file-$$";
\r
30 my $response = $self->request($request, $tmpfile);
\r
31 if ($response->is_success) {
\r
33 my $file_length = (stat($tmpfile))[7];
\r
34 my($content_length) = $response->header('Content-length');
\r
36 if (defined $content_length and $file_length < $content_length) {
\r
38 die "Transfer truncated: " .
\r
39 "only $file_length out of $content_length bytes received\n";
\r
40 } elsif (defined $content_length and $file_length > $content_length) {
\r
42 die "Content-length mismatch: " .
\r
43 "expected $content_length bytes, got $file_length\n";
\r
47 # Some dosish systems fail to rename if the target exists
\r
48 chmod 0777, $mirr_tmp;
\r
51 rename($tmpfile, $mirr_tmp) or
\r
52 die "Cannot rename '$tmpfile' to '$mirr_tmp': $!\n";
\r
54 if (my $lm = $response->last_modified) {
\r
55 # make sure the file has the same last modification time
\r
56 utime $lm, $lm, $mirr_tmp;
\r
66 ##########################################################
\r
68 ##########################################################
\r
69 package HTML::WalkerParser;
\r
70 require HTML::ParserTagEnd;
\r
71 @ISA = qw(HTML::ParserTagEnd);
\r
73 use vars qw(%LINK_ELEMENT);
\r
75 # Elements that might contain links and the name of the link attribute
\r
78 body => 'background',
\r
81 img => [qw(src lowsrc usemap)], # 'lowsrc' is a Netscape invention
\r
84 'link' => 'href', # need quoting since link is a perl builtin
\r
86 applet => [qw(codebase code)],
\r
88 iframe => 'src', # Netscape 2.0 extention
\r
89 embed => 'src', # used in Netscape 2.0 for Shockwave and things like that
\r
110 sub maybecont($$) {
\r
113 return unless exists $MAYBECONT{$tag};
\r
114 return ($MAYBECONT{$tag} eq $att);
\r
118 my($class, $parsed_fh) = @_;
\r
119 my $self = $class->SUPER::new;
\r
120 $self->{parsed_fh} = $parsed_fh;
\r
130 ##########################################################
\r
132 ##########################################################
\r
133 package HTML::LinkWalker;
\r
137 use File::Copy qw();
\r
138 use File::Path qw();
\r
140 use HTML::Entities;
\r
144 ##########################################################
\r
146 ##########################################################
\r
148 my $m_ua_personality = "LinkWalker/0.9";
\r
150 my %m_is_container;
\r
155 my $m_subMirrorAction;
\r
158 #############################
\r
159 ### Collecting info
\r
160 #############################
\r
161 my %m_CheckedLinks;
\r
164 sub tell_bad_link($$$$$) {
\r
170 $file = "START" unless defined $file;
\r
171 $lnum = "(start)" unless defined $lnum;
\r
172 my $longMsg = "<<$what>>";
\r
173 my $shortMsg = $what;
\r
174 if (defined $link) {
\r
175 my @lines = split("\\s+", $line);
\r
176 my $disp_line = join("\n\t\t ", @lines);
\r
177 $longMsg .= ",\n\t\tlink=$link\n\t\t$disp_line";
\r
179 my @msg = ($shortMsg, $longMsg);
\r
180 $m_CheckedLinks{$file}->{ERR}->{$lnum} = \@msg;
\r
181 &$m_subReport("\t* Error * " . $what . "\n");
\r
185 #############################
\r
187 #############################
\r
189 sub get_contenttype($) {
\r
190 my $response = shift;
\r
191 my @rh = $response->header("Content-Type");
\r
194 if ((my $iPos = index($r, ";")) > -1) {
\r
195 $c = substr($r, 0, $iPos);
\r
200 sub is_linked_contenttype($) {
\r
201 my $response = shift;
\r
202 return (get_contenttype($response) eq "text/html");
\r
205 sub ending_is_container($) {
\r
206 my $link_addr = shift;
\r
207 $link_addr =~ s!#.*$!!;
\r
208 $link_addr =~ s!\?.*$!!;
\r
209 return (($link_addr =~ m!\.s?html?$!i) ? 1 : 0);
\r
215 sub mk_mirror_name($) {
\r
216 my $orig_name = shift;
\r
217 $orig_name =~ tr!\\!/!;
\r
218 my $mirr_name = $orig_name;
\r
219 my ($orig_host) = ($orig_name =~ m!(^https?://[^/]*)!i);
\r
220 if (defined $orig_host) {
\r
221 my $host = $orig_host;
\r
224 $mirr_name =~ s!^$orig_host!$host!;
\r
225 if (substr($mirr_name, -1) eq "/") { $mirr_name .= "default.html"; }
\r
227 die "Can't find host in $orig_name\n";
\r
229 my $mirr_full = sMirrorRoot() . $mirr_name;
\r
231 my $sExt = $mirr_name; $sExt =~ s!.*\.([^\.]*$)!$1!;
\r
232 $mirr_full = sMirrorRoot() . "temp.$sExt";
\r
234 my $mirr_fold = $mirr_full;
\r
235 $mirr_fold =~ s![^/]*$!!;
\r
236 File::Path::mkpath($mirr_fold, 0, 0777);
\r
240 #############################
\r
242 #############################
\r
243 sub is_outside($) {
\r
244 my $uq_link_addr = shift;
\r
245 if (!exists $m_is_outside{$uq_link_addr}) {
\r
246 $m_is_outside{$uq_link_addr} = test_is_outside($uq_link_addr, \@m_sLinkRoots);
\r
248 return $m_is_outside{$uq_link_addr};
\r
250 sub set_is_container($$) {
\r
251 my $uq_link_addr = shift;
\r
252 return if exists $m_is_container{$uq_link_addr};
\r
253 $m_is_container{$uq_link_addr} = shift;
\r
255 sub is_outside_container($) {
\r
256 my $uq_link_addr = shift;
\r
257 if (exists $m_is_container{$uq_link_addr}) {
\r
258 if ($m_is_container{$uq_link_addr}) {
\r
259 return is_outside($uq_link_addr);
\r
263 sub test_is_outside($$) {
\r
264 my $uq_link_addr = shift;
\r
265 my $link_roots = shift;
\r
266 if (defined $link_roots) {
\r
268 for my $link_root (@$link_roots) {
\r
269 if (substr($uq_link_addr, 0, length($link_root)) eq $link_root) {
\r
279 ##########################################################
\r
281 ##########################################################
\r
285 sub HTML::WalkerParser::declaration {
\r
286 my($self, $decl) = @_;
\r
287 return unless defined $self->{parsed_fh};
\r
288 my $fh = $self->{parsed_fh};
\r
289 print $fh "<!" . $decl . ">";
\r
292 sub HTML::WalkerParser::start {
\r
293 my($self, $tag, $attr, $ended) = @_;
\r
294 &$m_start_cb($tag, $attr);
\r
295 return unless defined $self->{parsed_fh};
\r
297 for my $k (keys %$attr) {
\r
298 my $encoded = encode_entities($$attr{$k});
\r
299 $t .= qq( $k="$encoded");
\r
306 my $fh = $self->{parsed_fh};
\r
309 sub HTML::WalkerParser::end {
\r
310 my ($self, $tag) = @_;
\r
311 return unless defined $self->{parsed_fh};
\r
312 my $fh = $self->{parsed_fh};
\r
313 print $fh "</" . $tag . ">";
\r
315 sub HTML::WalkerParser::text {
\r
316 my ($self, $txt) = @_;
\r
317 return unless defined $self->{parsed_fh};
\r
318 my $fh = $self->{parsed_fh};
\r
321 sub HTML::WalkerParser::comment {
\r
322 my($self, $comment) = @_;
\r
323 return unless defined $self->{parsed_fh};
\r
324 my $fh = $self->{parsed_fh};
\r
325 print $fh "<!--" . $comment . "-->";
\r
331 ### Main parsing routine
\r
333 sub parse_file($$$$$$$$$) {
\r
334 my ($file_name, $parsed_fh, $uq_link_addr, $link_roots,
\r
335 $ref_links, $ref_anchs, $ref_lines, $ref_tagname, $ref_attname) = @_;
\r
337 if (-d $file_name) {
\r
338 $file_name = PathSubs::uniq_dir($file_name) . "default.html";
\r
339 $uq_link_addr .= "/" unless substr($uq_link_addr, -1) eq "/";
\r
340 $uq_link_addr .= "default.html";
\r
341 &$m_subReport("dir => $file_name\n");
\r
343 $fh = new IO::File($file_name);
\r
344 die "Can't read $file_name: $!\n" unless defined $fh;
\r
348 my $uq_link_fold = $uq_link_addr; $uq_link_fold =~ s![^/]*$!!;
\r
352 my ($tag, $attr_hash) = @_;
\r
353 for my $k (keys %$attr_hash) {
\r
354 if (($k eq "id") || ($k eq "name")) {
\r
355 my $v = $$attr_hash{$k};
\r
356 $$ref_anchs{$v} = $n;
\r
357 $$ref_lines{$n} = $line;
\r
358 } elsif (exists $LINKATTRIBS{$k}) {
\r
359 my $v = $$attr_hash{$k};
\r
360 next if $v =~ m!^javascript:!;
\r
361 next if $v =~ m!^ftp://!;
\r
362 next if $v =~ m!^mailto://!;
\r
363 if ($tag eq "base") { $base_href = $v if $k eq "href"; next; }
\r
364 my $v_abs; my $v_rel;
\r
365 my $v_is_abs = PathSubs::is_abs_path($v);
\r
368 $v_rel = PathSubs::mk_relative_link($uq_link_addr, $v_abs);
\r
371 if (defined $base_href) {
\r
372 $v_abs = PathSubs::mk_abs_link($base_href, $v);
\r
374 if (substr($v_rel, 0, 1) ne "#") {
\r
375 $v_abs = $uq_link_fold . $v_rel;
\r
377 $v_abs = $uq_link_addr . $v_rel;
\r
379 $v_abs = PathSubs::resolve_dotdot($v_abs);
\r
382 next if exists $m_CheckedLinks{$v_abs};
\r
383 if (is_outside($v_abs)) {
\r
385 if (ending_is_container($v_abs)) {
\r
386 $m_CheckedLinks{$v_abs} = {};
\r
387 tell_bad_link("Outside relative link ($v_rel)",
\r
388 $uq_link_addr, $n, $v, $line);
\r
391 ### Skip outside absolute links
\r
392 ### Could be things like banners etc...
\r
395 $$ref_links{$v_rel} = $n;
\r
396 $$ref_lines{$n} = $line;
\r
397 if (substr($v_rel, 0, 1) ne "#") {
\r
398 my $v_rel_name = $v_rel;
\r
399 $v_rel_name =~ s!#.*$!!;
\r
400 $v_rel_name =~ s!\?.*$!!;
\r
401 $$ref_tagname{$v_rel_name} = $tag;
\r
402 $$ref_attname{$v_rel_name} = $k;
\r
404 if ($v_is_abs && ($v_rel ne $v)) { $$attr_hash{$k} = $v_rel; }
\r
409 $m_start_cb = $start_cb;
\r
410 my $p = HTML::WalkerParser->new($parsed_fh);
\r
411 while ($line = <$fh>) {
\r
420 ##########################################################
\r
422 ##########################################################
\r
423 sub walk_link($$;$$$$) {
\r
424 die "$#_" unless ($#_ == 1 || $#_ == 5);
\r
425 my $link_fold = shift;
\r
426 my $link_file = shift;
\r
427 my $parent_url = shift;
\r
428 my $parent_lnum = shift;
\r
429 my $parent_link = shift;
\r
430 my $parent_line = shift;
\r
432 my $link_addr = $link_fold . $link_file;
\r
434 my $is_file = ($link_addr !~ m!^https?://!i);
\r
436 $uq_link_addr = PathSubs::uniq_file($link_addr);
\r
438 $uq_link_addr = PathSubs::resolve_dotdot($link_addr);
\r
440 return if exists $m_CheckedLinks{$uq_link_addr};
\r
441 return if exists $m_MissedLinks{$uq_link_addr};
\r
442 $m_CheckedLinks{$uq_link_addr} = {};
\r
443 my $link_is_container = ending_is_container($uq_link_addr);
\r
444 if ($link_is_container) {
\r
445 set_is_container($uq_link_addr, 1);
\r
446 return if is_outside($uq_link_addr);
\r
448 return if $m_bOnlyCont;
\r
455 if (!-r $uq_link_addr) {
\r
456 tell_bad_link("Can't read file ($uq_link_addr)",
\r
457 $parent_url, $parent_lnum, $parent_link, $parent_line);
\r
458 $m_MissedLinks{$uq_link_addr} = 1;
\r
461 $file_name = $uq_link_addr;
\r
463 $file_name = mk_mirror_name($uq_link_addr);
\r
464 if (!defined $ua) {
\r
465 $ua = new LWP::UserAgent;
\r
466 $ua->agent($m_ua_personality);
\r
470 $response = $ua->mirror($uq_link_addr, $file_name);
\r
471 &$m_subMirrorAction($uq_link_addr, $file_name, $response);
\r
473 my $request = new HTTP::Request('GET', $uq_link_addr);
\r
474 $response = $ua->request($request, $file_name);
\r
476 #dump_response($response); exit;
\r
477 if ($response->code != 304) {
\r
478 if (!$response->is_success) {
\r
479 tell_bad_link($response->status_line . " ($uq_link_addr)",
\r
480 $parent_url, $parent_lnum, $parent_link, $parent_line);
\r
481 $m_MissedLinks{$uq_link_addr} = 1;
\r
484 $bDoRewrite = $m_bMirror;
\r
485 $contenttype = get_contenttype($response);
\r
486 $link_is_container = is_linked_contenttype($response);
\r
488 if ($uq_link_addr ne $response->base) {
\r
490 my $base_file = mk_mirror_name($response->base);
\r
491 if (!File::Copy::copy($file_name, $base_file)) {
\r
492 die "Can't copy($file_name, $base_file): $!\n";
\r
494 if (my $lm = $response->last_modified) { utime $lm, $lm, $base_file; }
\r
495 $file_name = $base_file;
\r
497 $uq_link_addr = $response->base;
\r
500 ### Test again, could be new info from net!
\r
501 if ($link_is_container) {
\r
502 set_is_container($uq_link_addr, 1);
\r
503 return if is_outside($uq_link_addr);
\r
505 return if $m_bOnlyCont;
\r
508 &$m_subReport("$uq_link_addr ...");
\r
517 my $file_to_parse = $file_name;
\r
519 $parsed_file = $file_to_parse . "-p$$";
\r
520 &$m_subReport(" <<GET");
\r
521 $parsed_fh = new IO::File("> $parsed_file");
\r
522 die "Can't create $parsed_file: $!\n" unless defined $parsed_fh;
\r
523 print $parsed_fh "<!-- parsed version -->\n";
\r
525 &$m_subReport("\n");
\r
526 parse_file($file_to_parse, $parsed_fh, $uq_link_addr,
\r
528 \%links, \%anchs, \%lines, \%tagname, \%attname);
\r
529 if (defined $parsed_fh) {
\r
530 $parsed_fh->close();
\r
531 if (-e $file_name) { unlink $file_name or die "Can't unlink $file_name: $!"; }
\r
532 rename($parsed_file, $file_name) or die "Can't rename($parsed_file, $file_name): $!\n";
\r
533 if (my $lm = $response->last_modified) { utime $lm, $lm, $file_name; }
\r
536 if ($link_is_container) { return if is_outside($uq_link_addr); }
\r
538 $m_CheckedLinks{$uq_link_addr}->{ANC} = \%anchs;
\r
541 $file_dir = $uq_link_addr;
\r
542 $file_dir =~ s![^/]*$!!;
\r
545 my $container_folder = $uq_link_addr; $container_folder =~ s![^/]*$!!;
\r
546 &$m_subAction($uq_link_addr, $file_name, $contenttype);
\r
547 for my $link (sort keys %links) {
\r
548 # Next line is for onclick lines in prepared docs
\r
549 next if ($link eq "#");
\r
550 my $lnum = $links{$link};
\r
551 my $line = $lines{$lnum};
\r
553 tell_bad_link("Empty link", $uq_link_addr, $lnum, $link, $line);
\r
556 if ($link =~ m!(.*)\?!) { $link = $1; }
\r
558 if ($link =~ m!(.*)#(.*)!) { $link = $1; $anchor = $2; }
\r
560 if (!exists $anchs{$anchor}) {
\r
561 tell_bad_link("Anchor not found ($anchor)", $uq_link_addr, $lnum, $link, $line);
\r
568 if ($link =~ m!^https?://!i) {
\r
571 $uq_sublink = $link;
\r
575 $sub_fold = $file_dir;
\r
576 $uq_sublink = PathSubs::uniq_file($sub_fold . $sub_file);
\r
578 $sub_fold = $container_folder;
\r
579 $uq_sublink = $sub_fold . $sub_file;
\r
582 next if (exists $m_CheckedLinks{$uq_sublink});
\r
583 if (defined $anchor) {
\r
584 $m_CheckedLinks{$uq_link_addr}->{EXTANC}->{$uq_sublink} =
\r
585 { ANC=> $anchor, LINE=>$line, LNUM=>$lnum};
\r
587 if ($m_bOnlyCont) {
\r
588 die "link=$link\tattr=$tagname{$link}\n" unless exists $tagname{$link};
\r
589 next unless maybecont($tagname{$link}, $attname{$link});
\r
591 if (is_outside($uq_link_addr)) {
\r
592 if (maybecont($tagname{$link}, $attname{$link}) ) {
\r
596 walk_link($sub_fold, $sub_file, $uq_link_addr, $lnum, $link, $line);
\r
603 ############################################
\r
604 ### Some more checks!
\r
605 ############################################
\r
606 sub check_external_anchors() {
\r
607 &$m_subReport("\nChecking external anchors...\n");
\r
608 for my $f (sort keys %m_CheckedLinks) {
\r
609 my $fnode = $m_CheckedLinks{$f};
\r
610 if (exists ${$fnode}{"EXTANC"}) {
\r
611 my $extanc_hash = ${$fnode}{"EXTANC"};
\r
612 for my $fx (keys %$extanc_hash) {
\r
613 next unless (exists $m_CheckedLinks{$fx});
\r
614 my $ea_hash = ${$extanc_hash}{$fx};
\r
615 my $ea = ${$ea_hash}{ANC};
\r
616 my $fxnode = $m_CheckedLinks{$fx};
\r
617 my $fx_anc_hash = ${$fxnode}{"ANC"};
\r
618 if (!exists ${$fx_anc_hash}{$ea}) {
\r
619 my $line = ${$ea_hash}{LINE};
\r
620 my $lnum = ${$ea_hash}{LNUM};
\r
621 &$m_subReport("From $f\n");
\r
622 tell_bad_link("Ext anchor not found ($fx#$ea)",
\r
623 $f, $lnum, "$fx#$ea", $line);
\r
628 } # check_external_anchors
\r
632 #############################
\r
634 #############################
\r
635 sub report_errors($$) {
\r
638 my $errors_reported;
\r
640 for my $f (sort keys %m_CheckedLinks) {
\r
641 my $fnode = $m_CheckedLinks{$f};
\r
642 if (exists ${$fnode}{ERR}) {
\r
645 if (!defined $errors_reported) {
\r
646 $errors_reported = 1;
\r
647 &$m_subReport("\n\n*********** Summary ERRORS and WARNINGS **********\n");
\r
649 &$m_subReport("$f\n");
\r
650 my $err_hash = ${$fnode}{ERR};
\r
651 for my $e (sort keys %$err_hash) {
\r
652 my $refE = ${$err_hash}{$e};
\r
653 &$m_subReport("\t" . ${$refE}[0] . "\n");
\r
657 undef $errors_reported;
\r
659 for my $f (sort keys %m_CheckedLinks) {
\r
660 my $fnode = $m_CheckedLinks{$f};
\r
661 if (exists ${$fnode}{ERR}) {
\r
662 if (!defined $errors_reported) {
\r
663 $errors_reported = 1;
\r
664 &$m_subReport("\n\n*********** Detailed ERRORS and WARNINGS **********\n");
\r
666 &$m_subReport("$f\n");
\r
667 my $err_hash = ${$fnode}{ERR};
\r
668 for my $e (sort keys %$err_hash) {
\r
669 my $refE = ${$err_hash}{$e};
\r
670 &$m_subReport("\tat line $e: " . ${$refE}[1] . "\n");
\r
675 if ($errors_found) {
\r
676 die "\n*** There where errors ***\n";
\r
678 &$m_subReport("No errors found\n");
\r
682 sub dump_response($) {
\r
683 my $response = shift;
\r
684 &$m_subReport( $response->code . " " . $response->message . "\n");
\r
685 &$m_subReport( "****************************************\n");
\r
686 #&$m_subReport( $response->request . "\n");
\r
687 #&$m_subReport( "****************************************\n");
\r
688 #&$m_subReport( $response->previous . "\n");
\r
689 #&$m_subReport( "****************************************\n");
\r
690 &$m_subReport( " i=" . $response->is_info .
\r
691 ", s=" . $response->is_success .
\r
692 ", r=" . $response->is_redirect .
\r
693 ", e=" . $response->is_error . "\n");
\r
694 &$m_subReport( "****************************************\n");
\r
695 &$m_subReport( "content: " . $response->content . "\n");
\r
696 &$m_subReport( "****************************************\n");
\r
697 &$m_subReport( "base: " . $response->base . "\n");
\r
698 &$m_subReport( "****************************************\n");
\r
699 &$m_subReport( $response->as_string);
\r
700 &$m_subReport( "****************************************\n");
\r
701 &$m_subReport( $response->current_age . "\n");
\r
702 &$m_subReport( "****************************************\n");
\r
703 my @rh = $response->header("Content-Type");
\r
704 for my $r (@rh) { &$m_subReport( "ct: $r\n"); }
\r
705 &$m_subReport( "****************************************\n");
\r
709 #############################
\r
711 #############################
\r
712 sub sMirrorRoot() {
\r
714 $m_sMirrorRoot = PathSubs::get_temp_path() . "LinkWalker/" unless defined $m_sMirrorRoot;
\r
715 my $old = $m_sMirrorRoot;
\r
716 $m_sMirrorRoot = PathSubs::uniq_dir($val) if defined $val;
\r
721 my $old = $m_bMirror;
\r
722 $m_bMirror = $val if defined $val;
\r
726 sub subReporter(;$) {
\r
728 my $old = $m_subReport;
\r
729 $m_subReport = $val if defined $val;
\r
732 sub subAction(;$) {
\r
734 my $old = $m_subAction;
\r
735 $m_subAction = $val if defined $val;
\r
738 sub bOnlyCont(;$) {
\r
740 my $old = $m_bOnlyCont;
\r
741 $m_bOnlyCont = $val if defined $val;
\r
744 sub ua_personality(;$) {
\r
746 my $old = $m_ua_personality;
\r
747 $m_ua_personality = $val if defined $val;
\r
751 sub clear_roots() { @m_sLinkRoots = (); }
\r
752 sub get_roots() { return \@m_sLinkRoots; }
\r
753 sub add_root($) { push @m_sLinkRoots, shift; }
\r
754 sub add_files_root($) {
\r
757 my ($host) = ($file =~ m!(^https?://[^/]*)!i);
\r
758 if (defined $host) {
\r
759 $default_root = $file;
\r
761 die "Can't find $file\n" unless -e $file;
\r
762 $default_root = PathSubs::uniq_file($file);
\r
764 $default_root =~ s![^/]*$!!;
\r
765 add_root($default_root);
\r
768 ### Default actions
\r
770 $m_subReport = \&default_sub;
\r
771 $m_subAction = \&default_sub;
\r
772 $m_subMirrorAction = \&default_sub;
\r