#!/usr/local/bin/perl # # weblint - pick fluff off WWW pages (html). # # Copyright (C) 1994, 1995, 1996, 1997 Neil Bowers. All rights reserved. # # See README for additional blurb. # Bugs, comments, suggestions welcome: neilb@cre.canon.co.uk # # Latest version is available as: # ftp://ftp.cre.canon.co.uk/pub/weblint/weblint.tar.gz # $VERSION = '1.019'; ($PROGRAM = $0) =~ s@.*/@@; @TMPDIR_OPTIONS = ('/usr/tmp', '/tmp', '/var/tmp', '/temp'); $TMPDIR = &PickTmpdir(@TMPDIR_OPTIONS); $SITE_DIR = ''; $USER_RCFILE = $ENV{'WEBLINTRC'} || "$ENV{'HOME'}/.weblintrc"; $SITE_RCFILE = $SITE_DIR.'/global.weblintrc' if $SITE_DIR; #------------------------------------------------------------------------ # $version - the string which is displayed with -v or -version #------------------------------------------------------------------------ $versionString=< : HTML extension to include (supported: Java, Netscape) To check one or more HTML files, run weblint thusly: weblint file1.html [... fileN.html] If a file is in fact a directory, weblint will recurse, checking all files. To include the Netscape extensions: weblint -x Netscape file.html EofUsage #------------------------------------------------------------------------ # $todo - string displayed with the -todo switch #------------------------------------------------------------------------ $todo=< 0; # escape the `-' command-line switch (for stdin), so NGetOpt don't mess wi' it grep(s/^-$/\tstdin\t/, @ARGV); &NGetOpt(@options) || die "use -U switch to display usage statement\n"; # put back the `-' command-line switch, if it was there grep(s/^\tstdin\t$/-/, @ARGV); die "$versionString\n" if $opt_v || $opt_version; die "$usage" if $opt_u || $opt_help; &ReadDefaults(); # Read configuration if ($opt_f) { &ReadConfigFile($opt_f); } elsif (-f $USER_RCFILE) { &ReadConfigFile($USER_RCFILE); } elsif (! $opt_noglobals && defined $SITE_RCFILE && -f $SITE_RCFILE) { &ReadConfigFile($SITE_RCFILE); } # must do this after reading their config file to see a valid url-get &PrintToDo() if $opt_todo; # pedantic command-line switch turns on all warnings except case checking if ($opt_pedantic) { foreach $warning (keys %enabled) { &enableWarning($warning, 1); } &enableWarning('lower-case', 0); &enableWarning('upper-case', 0); &enableWarning('bad-link', 0); &enableWarning('require-doctype', 0); } &AddExtension("\L$opt_x") if $opt_x; $variable{'message-style'} = 'short' if $opt_s; $variable{'message-style'} = 'terse' if $opt_t; $variable{'url-get'} = $opt_urlget if $opt_urlget; *WARNING = *STDERR if $opt_stderr; &ListWarnings() if $opt_warnings; ($fileExtensions = $variable{'file-extensions'}) =~ s/,\s*/\|/g; # WARNING file handle is default select(WARNING); $opt_l = 1 if $ignore{'SYMLINKS'}; # -d to disable warnings if ($opt_d) { for (split(/,/,$opt_d)) { &enableWarning($_, 0); } } # -e to enable warnings if ($opt_e) { for (split(/,/,$opt_e)) { &enableWarning($_, 1) || next; } } # -i option to ignore case in element tags if ($opt_i) { $enabled{'lower-case'} = $enabled{'upper-case'} = 0; } if (defined $variable{'directory-index'}) { @dirIndices = split(/\s*,\s*/, $variable{'directory-index'}); } $argc = int(@ARGV); while (@ARGV > 0) { $arg = shift(@ARGV); &CheckURL($arg), next if $arg =~ m!^(http|gopher|ftp)://!; &find($arg), next if -d $arg; if ($opt_l && -l $arg && $argc == 1) { warn "$PROGRAM: $arg is a symlink, but I'll check it anyway\n"; } &WebLint($arg), next if (-f $arg && -r $arg) || $arg eq '-'; print "$PROGRAM: could not read $arg: $!\n"; } exit $exit_status; #======================================================================== # Function: WebLint # Purpose: This is the high-level interface to the checker. It takes # a file and checks for fluff. #======================================================================== sub WebLint { local($filename,$relpath) = @_; local(@tags) = (); local($tagRE) = (''); local(@taglines) = (); local(@orphans) = (); local(@orphanlines) = (); local(%seenPage); local(%seenTag); local($lastTAG) = ''; local(%whined); local(*PAGE); local($line) = (''); local($id, $ID); local($tag, $tagNum); local($closing); local($tail); local(%args); local($arg); local($rest); local($lastNonTag); local(@notSeen); local($seenMailtoLink) = (0); local($matched); local($matchedLine); local($novalue); local($heading); local($headingLine); local($commentline); local($_); if ($filename eq '-') { *PAGE = *STDIN; $filename = 'stdin'; } else { return if defined $seenPage{$filename}; if (-d $filename) { print "$PROGRAM: $filename is a directory.\n"; $exit_status = 0; return; } $seenPage{$filename}++; open(PAGE,"<$filename") || do { print "$PROGRAM: could not read file $filename: $!\n"; $exit_status = 0; return; }; $filename = $relpath if defined $relpath; } undef $heading; $tagNum = 0; READLINE: while () { $line .= $_; # $line =~ s/\n/ /g; while ($line =~ //o) { $nl = $lnt = $'; $nl =~ s/[^\n]//go; if ('PRE' =~ /^($tagRE)$/) { &whine($. - length($nl), 'meta-in-pre', '>', '>'); } else { &whine($. - length($nl), 'literal-metacharacter', '>', '>'); } } } #-------------------------------------------------------- #== SGML comment: #-------------------------------------------------------- if ($tail =~ /^!--/o) { $commentline = $. unless defined $commentline; # push lastNonTag onto word list for spell checking $ct = $'; next READLINE unless $ct =~ /--\s*>/o; undef $commentline; $comment = $`; $line = $'; # markup embedded in comment can confuse some (most? :-) browsers &whine($., 'markup-in-comment') if $comment =~ /<\s*[^>]+>/o; next; } undef $commentline; next READLINE unless $tail =~ /^(\s*)([^>]*)>/o; &whine($., 'leading-whitespace', $2) if $1 ne ''; $line = $'; ($tag = $2) =~ s/\n/ /go; $id = $tag; &whine($., 'unknown-element', $id),next if $id =~ /^\s*$/o; # push lastNonTag onto word list for spell checking undef $tail; undef $closing; #-- is ignored for now. $seenTag{'DOCTYPE'}=1,next if $id =~ /^!doctype/io; if (!$whined{'require-doctype'} && !$seenTag{'DOCTYPE'}) { &whine($., 'require-doctype'); $whined{'require-doctype'} = 1; } $closing = 0; if ($id =~ m@^/@o) { $id =~ s@^/@@o; $ID = "\U$id"; $closing = 1; } &CheckAttributes(); $TAG = ($closing ? '/' : '').$ID; if (defined $mustFollow{$TAG}) { $ok = 0; foreach $pre (split(/\|/, $mustFollow{$TAG})) { ($ok=1),last if $pre eq $lastTAG; } if (!$ok || $lastNonTag !~ /^\s*$/o) { &whine($., 'must-follow', $TAG, $mustFollow{$TAG}); } } #-- catch empty container elements if ($closing && $ID eq $lastTAG && $lastNonTag =~ /^\s*$/o && $tagNums[$#tagNums] == ($tagNum - 1) && $ID ne 'TEXTAREA' && $ID ne 'TD') { &whine($., 'empty-container', $ID); } #-- special case for empty optional container elements if (!$closing && @tags > 0 && $ID eq $tags[$#tags] && $lastTAG eq $ID && $ID =~ /^($maybePaired)$/ && $tagNums[$#tagNums] == ($tagNum - 1) && $lastNonTag =~ /^\s*$/o && $ID ne 'COL' && $ID ne 'COLGROUP') { pop @tags; # pop off the tag, don't care what it is $tline = pop @taglines; pop @tagNums; &whine($tline, 'empty-container', $ID); $tagRE = join('|',@tags); } #-- whine about unrecognized element, and do no more checks ---- if (!defined $validAttributes{$ID}) { if (defined $netscapeAttributes{$ID} || defined $msAttributes{$ID}) { &whine($., 'extension-markup', ($closing ? "/$id" : "$id")); } else { &whine($., 'unknown-element', ($closing ? "/$id" : "$id")); } next; } if ($closing == 0 && defined $requiredAttributes{$ID}) { foreach $attr (split(/\|/,$requiredAttributes{$ID})) { unless (defined $args{$attr}) { &whine($., 'required-attribute', $attr, $id); } } } elsif ($closing == 0 && $id =~ /^($expectArgsRE)$/io) { &whine($., 'expected-attribute', $id) unless defined %args; } #-------------------------------------------------------- #== check case of tags #-------------------------------------------------------- &whine($., 'upper-case', $id) if $id ne $ID; &whine($., 'lower-case', $id) if $id ne "\L$id"; #-------------------------------------------------------- #== if tag id is /foo, then strip slash, and mark as a closer #-------------------------------------------------------- if ($closing) { if ($ID !~ /^($pairElements)$/o) { &whine($., 'illegal-closing', $id); } if ($ID eq 'A' && $lastNonTag =~ /^\s*here\s*$/io) { &whine($., 'here-anchor'); } if ($ID eq 'TITLE' && length($lastNonTag) > 64) { &whine($., 'title-length'); } #-- end of HEAD, did we see a TITLE in the HEAD element? ---- &whine($., 'require-head') if $ID eq 'HEAD' && !$seenTag{'TITLE'}; #-- was there a element in HEAD? &whine($., 'mailto-link') if $ID eq 'HEAD' && $seenMailtoLink == 0; } else { #-------------------------------------------------------- # do context checks. Should really be a state machine. #-------------------------------------------------------- if (defined $physicalFontElements{$ID}) { &whine($., 'physical-font', $ID, $physicalFontElements{$ID}); } if ($ID =~ /^H[1-6]$/o && 'A' =~ /^($tagRE)$/) { &whine($., 'heading-in-anchor', $ID); } if ($ID eq 'A' && defined $args{'HREF'}) { $target = $args{'HREF'}; if ($target =~ /([^:]+):\/\/([^\/]+)(.*)$/o || $target =~ /^(news|mailto):/o || $target =~ /^\//o) { } else { $target =~ s/#.*$//o; if ($target !~ /^\s*$/o && ! -f $target && ! -d $target) { &whine($., 'bad-link', $target); } } } if ($ID =~ /^H(\d)$/o) { if (defined $heading && $1 - $heading > 1) { &whine($., 'heading-order', $ID, $heading, $headingLine); } $heading = $1; $headingLine = $.; } #-- check for mailto: LINK ------------------------------ if ($ID eq 'LINK' && defined $args{'REV'} && defined $args{'HREF'} && $args{'REV'} =~ /^made$/io && $args{'HREF'} =~ /^mailto:/io) { $seenMailtoLink = 1; } if (defined $onceOnly{$ID}) { &whine($., 'once-only', $ID, $seenTag{$ID}) if $seenTag{$ID}; } $seenTag{$ID} = $.; &whine($., 'body-no-head') if $ID eq 'BODY' && !$seenTag{'HEAD'}; if ($ID ne 'HTML' && $ID ne '!DOCTYPE' && !$seenTag{'HTML'} && !$whined{'outer-html'}) { &whine($., 'html-outer'); $whined{'outer-html'} = 1; } #-- check for illegally nested elements --------------------- if ($ID =~ /^($nonNest)$/o && $ID =~ /^($tagRE)$/) { for ($i=$#tags; $tags[$i] ne $ID; --$i) { } &whine($., 'nested-element', $ID, $taglines[$i]); } unless (defined $validAttributes{$ID}) { &whine($., 'unknown-element', $ID); } #-------------------------------------------------------- # check for tags which have a required context #-------------------------------------------------------- if (defined $requiredContext{$ID}) { $ok = 0; foreach $context (split(/\|/, $requiredContext{$ID})) { ($ok=1),last if $context =~ /^($tagRE)$/; } unless ($ok) { &whine($., 'required-context', $ID, $requiredContext{$ID}); } } #-------------------------------------------------------- # check for tags which can only appear in the HEAD element #-------------------------------------------------------- if ($ID =~ /^($headTagsRE)$/o && 'HEAD' !~ /^($tagRE)$/) { &whine($., 'head-element', $ID); } if (! defined $okInHead{$ID} && 'HEAD' =~ /^($tagRE)$/) { &whine($., 'non-head-element', $ID); } #-------------------------------------------------------- # check for tags which have been deprecated (now obsolete) #-------------------------------------------------------- &whine($., 'obsolete', $ID) if $ID =~ /^($obsoleteTags)$/o; } #-------------------------------------------------------- #== was tag of type ... ? #== welcome to kludgeville, population seems to be on the increase! #-------------------------------------------------------- if ($ID =~ /^($pairElements)$/o) { if (!$closing && @tags > 0 && $ID eq $tags[$#tags] && $ID =~ /^($maybePaired)$/o) { pop @tags; pop @tagNums; pop @taglines; $tagRE = join('|',@tags); } if ($closing) { # trailing whitespace in content of container element if ($lastNonTag =~ /\S\s+$/o && $ID =~ /^($cuddleContainers)$/o) { &whine($., 'container-whitespace', 'trailing', $ID); } #-- if we have a closing tag, and the tag(s) on top of the stack #-- are optional closing tag elements, pop tag off the stack, #-- unless it matches the current closing tag if (@tags > 0 && $tags[$#tags] ne $ID && $tags[$#tags] =~ /^($maybePaired)$/o && $lastNonTag =~ /^\s*$/o && $tagNums[$#tagNums] == ($tagNum - 1) && $ID ne 'TD') { $tline = $taglines[$#taglines]; &whine($tline, 'empty-container', $tags[$#tags]); } while (@tags > 0 && $tags[$#tags] ne $ID && $tags[$#tags] =~ /^($maybePaired)$/o) { pop @tags; pop @tagNums; pop @taglines; } $tagRE = join('|',@tags); } else { # leading whitespace in content of container element if ($line =~ /^\s+/o && $ID =~ /^($cuddleContainers)$/o) { &whine($., 'container-whitespace', 'leading', $ID); } } if ($closing && $tags[$#tags] eq $ID) { &PopEndTag(); } elsif ($closing && $tags[$#tags] ne $ID) { #-- closing tag does not match opening tag on top of stack if ($ID =~ /^($tagRE)$/) { # If we saw , , or , then we try # and resolve anything inbetween on the tag stack if ($ID =~ /^(HTML|HEAD|BODY)$/o) { while ($tags[$#tags] ne $ID) { $ttag = pop @tags; pop @tagNums; $ttagline = pop @taglines; if ($ttag !~ /^($maybePaired)$/o) { &whine($., 'unclosed-element', $ttag, $ttagline); } #-- does top of stack match top of orphans stack? -- while (@orphans > 0 && @tags > 0 && $orphans[$#orphans] eq $tags[$#tags]) { pop @orphans; pop @orphanlines; pop @tags; pop @tagNums; pop @taglines; } } #-- pop off the HTML, HEAD, or BODY tag ------------ pop @tags; pop @tagNums; pop @taglines; $tagRE = join('|',@tags); } else { #-- matched opening tag lower down on stack push(@orphans, $ID); push(@orphanlines, $.); } } else { if ($ID =~ /^H[1-6]$/o && $tags[$#tags] =~ /^H[1-6]$/o) { &whine($., 'heading-mismatch', $tags[$#tags], $ID); &PopEndTag(); } else { &whine($., 'mis-match', $ID); } } } else { push(@tags,$ID); $tagRE = join('|',@tags); push(@tagNums,$tagNum); push(@taglines,$.); } } #-------------------------------------------------------- #== inline images (IMG) should have an ALT argument :-) #-------------------------------------------------------- &whine($., 'img-alt') if ($ID eq 'IMG' && !defined $args{'ALT'} && !$closing); #-------------------------------------------------------- #== WIDTH & HEIGHT on inline images (IMG) can help browsers #-------------------------------------------------------- &whine($., 'img-size') if ($ID eq 'IMG' && !defined $args{'WIDTH'} && !defined $args{'HEIGHT'} && !$closing); } continue { ++$tagNum; $lastTAG = $TAG; } $lastNonTag = $line; } close PAGE; if (defined $commentline) { &whine($commentline, 'unclosed-comment'); return; } while (@tags > 0) { $tag = shift(@tags); shift(@tagNums); $line = shift(@taglines); if ($tag !~ /^($maybePaired)$/o) { &whine($., 'unclosed-element', $tag, $line); } } for (@expectedTags) { # if we haven't seen TITLE but have seen HEAD # then we'll have already whined about the lack of a TITLE element next if $_ eq 'TITLE' && !$seenTag{$_} && $seenTag{'HEAD'}; next if $_ eq 'BODY' && $seenTag{'FRAMESET'}; push(@notSeen,$_) unless $seenTag{$_}; } if (@notSeen > 0) { printf "%sexpected tag(s) not seen: @notSeen\n", ($opt_s ? "" : "$filename(-): "); $exit_status = 1; } } #======================================================================== # Function: CheckAttributes # Purpose: If the tag has attributes, check them for validity. #======================================================================== sub CheckAttributes { undef %args; if ($closing == 0 && $tag =~ m|^(\S+)\s+(.*)|) { ($id,$tail) = ($1,$2); $ID = "\U$id"; # don't worry, or warn, about attributes of unknown elements return unless defined $validAttributes{$ID}; $tail =~ s/\n/ /go; # check for odd number of quote characters ($quotes = $tail) =~ s/[^""]//go; &whine($., 'odd-quotes', $tag) if length($quotes) % 2 == 1; $novalue = 0; $valid = $validAttributes{$ID}; while ($tail =~ /^\s*([^=\s]+)\s*=\s*(.*)$/o # catch attributes like ISMAP for IMG, with no arg || ($tail =~ /^\s*(\S+)(.*)/o && ($novalue = 1))) { $arg = "\U$1"; $rest = $2; &whine($., 'unexpected-open', $tag) if $arg =~ / if (defined $args{$arg}) { &whine($., 'repeated-attribute', $arg, $id); } if ($novalue) { $args{$arg} = ''; $tail = $rest; } elsif ($rest =~ /^'([^'']+)'(.*)$/o) { &whine($., 'attribute-delimiter', $arg, $ID); $args{$arg} = $1; $tail = $2; } elsif ($rest =~ /^"([^""]*)"(.*)$/o || $rest =~ /^'([^'']*)'(.*)$/o) { $args{$arg} = $1; $tail = $2; } elsif ($rest =~ /^(\S+)(.*)$/o) { $attrValue = $1; $args{$arg} = $attrValue; $tail = $2; if ($attrValue =~ /[^-.A-Za-z0-9]/o) { &whine($., 'quote-attribute-value', $arg, $attrValue, $ID); } } else { $args{$arg} = $rest; $tail = ''; } $novalue = 0; } foreach $attr (keys %args) { if (defined $attributeFormat{$attr} && $args{$attr} !~ /^($attributeFormat{$attr})$/i) { &whine($., 'attribute-format', $attr, $id, $args{$attr}); } } &whine($., 'unexpected-open', $tag) if $tail =~ / # The associative array `enabled' is used as a gating # function, to suppress or enable each warning. Every # warning has an associated identifier, which is used to # refer to the warning, and as the index into the hash. #======================================================================== sub whine { local($line, $id, @argv) = @_; local($mstyle) = $variable{'message-style'}; return unless $enabled{$id}; @argv = @argv; $exit_status = 1; (print "$filename:$line:$id\n"), return if $mstyle eq 'terse'; (eval "print \"\$filename($line): $message{$id}\n\""), return if $mstyle eq 'lint'; (eval "print \"line $line: $message{$id}\n\""), return if $mstyle eq 'short'; die "Unknown message style `$mstyle'\n"; } #======================================================================== # Function: ReadConfigFile # Purpose: Read the specified configuration file. This is used to # the user's .weblintrc file, or the global system config # file, if the user doesn't have one. #======================================================================== sub ReadConfigFile { local($filename) = @_; local(*CONFIG); local($arglist); local($keyword, $value); local($_); open(CONFIG,"< $filename") || do { print WARNING "Unable to read config file `$filename': $!\n"; return; }; while () { chop; s/#.*$//; next if /^\s*$/o; #-- match keyword: process one or more argument ------------------- if (/^\s*(enable|disable|extension|ignore)\s+(.*)$/io) { $keyword = "\U$1"; $arglist = $2; while ($arglist =~ /^\s*(\S+)/o) { $value = "\L$1"; &enableWarning($1, 1) if $keyword eq 'ENABLE'; &enableWarning($1, 0) if $keyword eq 'DISABLE'; $ignore{"\U$1"} = 1 if $keyword eq 'IGNORE'; &AddExtension("\L$1") if $keyword eq 'EXTENSION'; $arglist = $'; } } elsif (/^\s*set\s+(\S+)\s*=\s*(.*)/o) { # setting a weblint variable if (defined $variable{$1}) { $variable{$1} = $2; } else { print WARNING "Unknown variable `$1' in configuration file\n"; } } elsif (/^\s*use\s*global\s*weblintrc/o) { if (-f $SITE_RCFILE) { &ReadConfigFile($SITE_RCFILE); } else { print WARNING "$PROGRAM: unable to read global config file\n"; next; } } else { print WARNING ("$PROGRAM: ignoring unknown sequence (\"$_\") ". "in config file $filename\n"); } } close CONFIG; } #======================================================================== # Function: enableWarning # Purpose: Takes a warning identifier and an integer (boolean) # flag which specifies whether the warning should be # enabled. #======================================================================== sub enableWarning { local($id, $enabled) = @_; if (! defined $enabled{$id}) { print WARNING "$PROGRAM: unknown warning identifier \"$id\"\n"; return 0; } $enabled{$id} = $enabled; # # ensure consistency: if you just enabled upper-case, # then we should make sure that lower-case is disabled # $enabled{'lower-case'} = 0 if $id eq 'upper-case'; $enabled{'upper-case'} = 0 if $id eq 'lower-case'; $enabled{'upper-case'} = $enabled{'lower-case'} = 0 if $id eq 'mixed-case'; return 1; } #======================================================================== # Function: AddExtension # Purpose: Extend the HTML understood. Currently supported extensions: # Netscape - the netscape extensions proposed by # Netscape Communications, Inc. See: # Microsoft - the extensions for Microsoft Internet # Explorer # http://www.netscape.com/home/services_docs/html-extensions.html #======================================================================== sub AddExtension { local($extension) = @_; local(@extlist); local($element); if ($extension =~ /,/o) { @extlist = split(/\s*,\s*/, $extension); &AddExtension(shift @extlist) while @extlist > 0; return; } if ( $extension ne 'netscape' && $extension ne 'microsoft') { warn "$PROGRAM: unknown extension `$extension' -- ignoring.\n"; return; } #--------------------------------------------------------------------- # Microsoft extensions #--------------------------------------------------------------------- if ($extension eq 'microsoft') { #-- new element attributes for existing elements --------------------- foreach $element (keys %msAttributes) { $validAttributes{$element} = $msAttributes{$element}; } $pairElements .= '|'.$msPaired; $maybePaired .= '|COL|COLGROUP'; $attributeFormat{'LOOP'} = '\d+|INFINITE'; $okInHead{'BGSOUND'} = 1; $requiredAttributes{'BGSOUND'} = 'SRC'; $attributeFormat{'LEFTMARGIN'} = '\d+'; $attributeFormat{'TOPMARGIN'} = '\d+'; $requiredContext{'AREA'} = 'MAP'; $requiredAttributes{'MAP'} = 'NAME'; $requiredAttributes{'AREA'} = 'COORDS'; #-- MARQUEE attributes $attributeFormat{'BEHAVIOR'} = 'SCROLL|SLIDE|ALTERNATE'; $attributeFormat{'DIRECTION'} = 'LEFT|RIGHT'; $attributeFormat{'WIDTH'} = '\d+%?'; $attributeFormat{'HEIGHT'} = '\d+%?'; # attribute format check for attributes which take colors $attributeFormat{'ALINK'} = $attributeFormat{'VLINK'} = $attributeFormat{'LINK'} = $attributeFormat{'TEXT'} = $attributeFormat{'BGCOLOR'} = $attributeFormat{'COLOR'} = $attributeFormat{'BORDERCOLOR'} = $attributeFormat{'BORDERCOLORLIGHT'} = $colorRE; # LI TYPE= can take A, a, I, i, or 1 $attributeFormat{'TYPE'} .= '|[AaIi1]'; # the FRAME and RULES attributes for TABLE element $attributeFormat{'FRAME'} = 'VOID|ABOVE|BELOW|HSIDES|LHS|RHS|VSIDES|'. 'BOX|BORDER'; $attributeFormat{'RULES'} = 'NONE|GROUPS|ROWS|COLS|ALL'; # START attribute for IMG element $attributeFormat{'START'} = 'FILEOPEN|MOUSEOVER'; # VALUETYPE attribute for PARAM element (in an OBJECT) $attributeFormat{'VALUETYPE'} = 'DATA|REF|OBJECT'; $requiredContext{'THEAD'} = 'TABLE'; $requiredContext{'TBODY'} = 'TABLE'; $requiredContext{'TFOOT'} = 'TABLE'; $requiredContext{'COLGROUP'} = 'TABLE'; $requiredContext{'COL'} = 'COLGROUP'; $requiredContext{'PARAM'} = 'APPLET|OBJECT'; # COMMENT, LISTING, PLAINTEXT, and XMP are valid elements for Microsoft $obsoleteTags = ''; } #--------------------------------------------------------------------- # Netscape extensions #--------------------------------------------------------------------- if ($extension eq 'netscape') { #-- new element attributes for existing elements --------------------- foreach $element (keys %netscapeAttributes) { $validAttributes{$element} = $netscapeAttributes{$element}; } #-- formats for new attributes --------------------------------------- $attributeFormat{'SIZE'} = '[-+]?\d+'; $attributeFormat{'MARGINWIDTH'} = '\d+'; $attributeFormat{'MARGINHEIGHT'} = '\d+'; $attributeFormat{'SCROLLING'} = 'NO|YES|AUTO'; $attributeFormat{'TYPE'} .= '|[AaIi1]|disc|square|circle'; #-- attributes for EMBED element $attributeFormat{'TYPE'} .= '|.+'; $attributeFormat{'PALETTE'} = 'FOREGROUND|BACKGROUND'; #-- new elements ----------------------------------------------------- $pairElements .= '|'.$netscapePaired; $requiredContext{'AREA'} = 'MAP'; $requiredContext{'FRAME'} = 'FRAMESET'; $requiredContext{'KEYGEN'} = 'FORM'; $requiredAttributes{'MAP'} = 'NAME'; $requiredAttributes{'AREA'} = 'COORDS'; # this should really be specific to ROWS and COLS in FRAMESET element< $attributeFormat{'ROWS'} = $attributeFormat{'COLS'} = '\d+|(\d*[*%]?,)*\s*\d*[*%]?'; # this is for TEXTAREA $attributeFormat{'WRAP'} .= '|SOFT|HARD'; # attribute format check for attributes which take colors $attributeFormat{'ALIGN'} .= '|TEXTTOP|ABSMIDDLE|BASELINE|ABSBOTTOM'; # BASE can take just a TARGET attribute, HREF not required therefore delete $requiredAttributes{'BASE'}; $expectArgsRE .= '|FONT|BASE'; $okInHead{'SCRIPT'} = 1; } } sub AddAttributes { local($element,@attributes) = @_; local($attr); $attr = join('|', @attributes); if (defined $validAttributes{$element}) { $validAttributes{$element} .= "|$attr"; } else { $validAttributes{$element} = "$attr"; } } #======================================================================== # Function: ListWarnings() # Purpose: List all supported warnings, with identifier, and # whether the warning is enabled. #======================================================================== sub ListWarnings { local($id); local($message); foreach $id (sort keys %enabled) { ($message = $message{$id}) =~ s/\$argv\[\d+\]/.../g; $message =~ s/\\"/"/g; print WARNING "$id (", ($enabled{$id} ? "enabled" : "disabled"), ")\n"; print WARNING " $message\n\n"; } } sub CheckURL { local($url) = @_; local($workfile) = "$TMPDIR/$PROGRAM.$$"; local($urlget) = $variable{'url-get'}; die "$PROGRAM: url-get variable is not defined -- ". "don't know how to get $url\n" unless defined $urlget; system("$urlget $url > $workfile"); &WebLint($workfile, $url); unlink $workfile; } sub PrintToDo { print STDERR "$todo"; exit 0; } #======================================================================== # Function: wanted # Purpose: This is called by &find() to determine whether a file # is wanted. We're looking for files, with the filename # extension .html or .htm. #======================================================================== sub wanted { local($foundIndex); if (-d $_) { $foundIndex = 0; foreach $legalIndex (@dirIndices) { $foundIndex=1,last if -f "$_/$legalIndex"; } if (! $foundIndex) { &whine("$arg/$_", 'directory-index', "@dirIndices"); } } /\.($fileExtensions)$/ && # valid filename extensions: .html .htm -f $_ && # only looking for files (!$opt_l || !-l $_) && # ignore symlinks if -l given &WebLint($_,$name); # check the file } sub PopEndTag { $matched = pop @tags; pop @tagNums; $matchedLine = pop @taglines; #-- does top of stack match top of orphans stack? -------- while (@orphans > 0 && @tags > 0 && $orphans[$#orphans] eq $tags[$#tags]) { &whine($., 'element-overlap', $orphans[$#orphans], $orphanlines[$#orphanlines], $matched, $matchedLine); pop @orphans; pop @orphanlines; pop @tags; pop @tagNums; pop @taglines; } $tagRE = join('|',@tags); } #======================================================================== # Function: PickTmpdir # Purpose: Pick a temporary working directory. If TMPDIR environment # variable is set, then we try that first. #======================================================================== sub PickTmpdir { local(@options) = @_; local($tmpdir); @options = ($ENV{'TMPDIR'}, @options) if defined $ENV{'TMPDIR'}; foreach $tmpdir (@options) { return $tmpdir if -d $tmpdir && -w $tmpdir; } die "$PROGRAM: unable to find a temporary directory.\n", ' ' x (length($PROGRAM)+2), "tried: ",join(' ',@options),"\n"; } #======================================================================== # Function: ReadDefaults # Purpose: Read the built-in defaults. These are stored at the end # of the script, after the __END__, and read from the # DATA filehandle. #======================================================================== sub ReadDefaults { local(@elements); while () { chop; s/^\s*//; next if /^$/; push(@elements, $_); next unless @elements == 3; ($id, $default, $message) = @elements; $enabled{$id} = ($default eq 'ENABLE'); ($message{$id} = $message) =~ s/"/\\"/g; undef @elements; } } __END__ upper-case DISABLE tag <$argv[0]> is not in upper case. lower-case DISABLE tag <$argv[0]> is not in lower case. mixed-case ENABLE tag case is ignored here-anchor ENABLE bad form to use `here' as an anchor! require-head ENABLE no in HEAD element. once-only ENABLE tag <$argv[0]> should only appear once. I saw one on line $argv[1]! body-no-head ENABLE <BODY> but no <HEAD>. html-outer ENABLE outer tags should be <HTML> .. </HTML>. head-element ENABLE <$argv[0]> can only appear in the HEAD element. non-head-element ENABLE <$argv[0]> cannot appear in the HEAD element. obsolete ENABLE <$argv[0]> is obsolete. mis-match ENABLE unmatched </$argv[0]> (no matching <$argv[0]> seen). img-alt ENABLE IMG does not have ALT text defined. nested-element ENABLE <$argv[0]> cannot be nested -- </$argv[0]> not yet seen for <$argv[0]> on line $argv[1]. mailto-link DISABLE did not see <LINK REV=MADE HREF="mailto..."> in HEAD. element-overlap ENABLE </$argv[0]> on line $argv[1] seems to overlap <$argv[2]>, opened on line $argv[3]. unclosed-element ENABLE no closing </$argv[0]> seen for <$argv[0]> on line $argv[1]. markup-in-comment ENABLE markup embedded in a comment can confuse some browsers. unknown-attribute ENABLE unknown attribute "$argv[1]" for element <$argv[0]>. leading-whitespace ENABLE should not have whitespace between "<" and "$argv[0]>". required-attribute ENABLE the $argv[0] attribute is required for the <$argv[1]> element. unknown-element ENABLE unknown element <$argv[0]>. odd-quotes ENABLE odd number of quotes in element <$argv[0]>. heading-order ENABLE bad style - heading <$argv[0]> follows <H$argv[1]> on line $argv[2]. bad-link DISABLE target for anchor "$argv[0]" not found. expected-attribute ENABLE expected an attribute for <$argv[0]>. unexpected-open ENABLE unexpected < in <$argv[0]> -- potentially unclosed element. required-context ENABLE illegal context for <$argv[0]> - must appear in <$argv[1]> element. unclosed-comment ENABLE unclosed comment (comment should be: <!-- ... -->). illegal-closing ENABLE element <$argv[0]> is not a container -- </$argv[0]> not legal. extension-markup ENABLE <$argv[0]> is extended markup (use "-x <extension>" to allow this). extension-attribute ENABLE attribute `$argv[0]' for <$argv[1]> is extended markup (use "-x <extension>" to allow this). physical-font DISABLE <$argv[0]> is physical font markup -- use logical (such as $argv[1]). repeated-attribute ENABLE attribute $argv[0] is repeated in element <$argv[1]> must-follow ENABLE <$argv[0]> must immediately follow <$argv[1]> empty-container ENABLE empty container element <$argv[0]>. directory-index ENABLE directory does not have an index file ($argv[0]) closing-attribute ENABLE closing tag <$argv[0]> should not have any attributes specified. attribute-delimiter ENABLE use of ' for attribute value delimiter is not supported by all browsers (attribute $argv[0] of tag $argv[1]) img-size DISABLE setting WIDTH and HEIGHT attributes on IMG tag can improve rendering performance on some browsers. container-whitespace DISABLE $argv[0] whitespace in content of container element $argv[1] require-doctype DISABLE first element was not DOCTYPE specification literal-metacharacter ENABLE metacharacter '$argv[0]' should be represented as '$argv[1]' heading-mismatch ENABLE malformed heading - open tag is <$argv[0]>, but closing is </$argv[1]> bad-text-context ENABLE illegal context, <$argv[0]>, for text; should be in $argv[1]. attribute-format ENABLE illegal value for $argv[0] attribute of $argv[1] ($argv[2]) quote-attribute-value ENABLE value for attribute $argv[0] ($argv[1]) of element $argv[2] should be quoted (i.e. $argv[0]="$argv[1]") meta-in-pre ENABLE you should use '$argv[0]' in place of '$argv[1]', even in a PRE element. heading-in-anchor ENABLE <A> should be inside <$argv[0]>, not <$argv[0]> inside <A>. title-length ENABLE The HTML spec. recommends the TITLE be no longer than 64 characters.