#!/usr/bin/perl # HTML Article declutter # Daniel Beer # 6 Feb 2014 # # This file is in the public domain. use strict; use warnings; use HTML::TreeBuilder; use IO::HTML; # These tags are deemed to be points of decision-making when pruning. my %BLOCK_TAGS = ( h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1, div => 1, section => 1, header => 1, footer => 1, li => 1, ul => 1, ol => 1, dl => 1, dt => 1, dd => 1 ); ######################################################################## # Prune # # This is a heavy-handed filter which strips out large chunks of the # page based on class/id keywords and tag types. # # Pruning is top-down. ######################################################################## sub poison { my ($id) = @_; $id = lc($id || ''); return 1 if $id =~ /comment/; return 1 if $id =~ /^footer/; return 1 if $id =~ /login/; return 1 if $id =~ /popup/; return 1 if $id =~ /promo/; return 1 if $id =~ /related/; return 1 if $id =~ /share/; return 1 if $id =~ /slideshow/; return 1 if $id =~ /links/; return 1 if $id =~ /sidebar/; return 1 if $id =~ /search/; return 0; } sub should_prune { my ($e) = @_; my $tag = lc($e->tag); return 1 if $tag eq 'script'; return 1 if $tag eq 'noscript'; return 1 if $tag eq 'header'; return 1 if $tag eq 'footer'; return 1 if $tag eq 'style'; return 1 if $tag eq 'img'; return 1 if $tag eq 'nav'; if ($BLOCK_TAGS{$tag}) { return 1 if poison($e->id); return 1 if poison($e->attr("class")); } return 0; } sub prune { my ($e) = @_; if (should_prune($e)) { $e->detach; $e->delete; } else { foreach ($e->content_list) { prune($_) if ref $_; } } } ######################################################################## # Shake # # This filter is a heuristic filter. It examines link density and # word-counts to try and differentiate link-spam from article content. # # It makes these decisions bottom-up, deciding only for tags in # %BLOCK_TAGS. If it's decided that a subtree should be kept, this # decision propagates upwards (we can't delete a tree if it contains # a subtree which we've decided to keep). # # Some kinds of tags can confer a blessing on their descendants, which # prevents removal. ######################################################################## sub sesame { my ($id) = @_; $id = lc($id || ''); return 1 if $id =~ /author/; return 1 if $id =~ /byline/; return 0; } sub blessed { my ($e) = @_; return 1 if lc($e->tag) eq 'a' && ($e->attr("rel") || '') eq "author"; return 1 if sesame($e->attr("id")); return 1 if sesame($e->attr("class")); return 0; } sub word_count { my @w = split(/[^0-9A-Za-z]/, $_[0]); my $c = 0; foreach (@w) { $c++ if length($_) > 0; } return $c; } sub shake { my ($e, $pstat, $pcontext) = @_; my $wc = 0; my $tag = lc($e->tag); my %context = %{$pcontext || {}}; my %stat = ( LinkWords => 0, Words => 0, Tags => 1, Keep => 0 ); $context{Link} = 1 if $tag eq 'a'; $context{Blessed} = 1 if blessed($e); foreach ($e->content_list) { if (ref $_) { shake($_, \%stat, \%context); } else { $stat{$context{Link} ? "LinkWords" : "Words"} += word_count($_); } } $stat{Keep} = 1 if $context{Blessed}; if (defined($BLOCK_TAGS{$tag})) { my $score = $stat{Words} - $stat{LinkWords} * 2 - $stat{Tags}; $stat{Keep} = 1 if $score >= 0; unless ($stat{Keep}) { $e->detach; $e->delete; } } if (defined($pstat)) { $pstat->{$_} += $stat{$_} foreach(keys %stat); } } ######################################################################## # Purify # # In this filter we flatten the document structure, removing most markup # and attributes. The text content is kept. ######################################################################## my %PURE_TAGS = ( 'p' => [], 'pre' => [], 'br' => [], 'em' => [], 'strong' => [], 'h1' => [], 'h2' => [], 'h3' => [], 'h4' => [], 'h5' => [], 'html' => [], 'body' => [], 'head' => [], 'title' => [], 'meta' => ['http-equiv', 'name', 'content'], 'li' => [], 'ol' => [], 'ul' => [], 'dl' => [], 'dt' => [], 'dd' => [], 'blockquote' => [] ); sub purify { my ($e) = @_; my (@children); return $e unless ref $e; foreach ($e->content_list) { push @children, purify($_); } my $t = lc($e->tag); my $atlist = $PURE_TAGS{$t}; return @children unless defined($atlist); my @attrs; foreach (@{$atlist}) { my $v = $e->attr($_); push @attrs, $_, $v if defined($v); } return () if $t ne 'br' && $#children < 0 && $#attrs < 0; my $n = HTML::Element->new($t, @attrs); foreach (@children) { $n->push_content($_); } return $n; } ######################################################################## # Top-level # # Read a file, guessing encoding, then prune => shake => purify. ######################################################################## sub add_comment { my ($e, $comment) = @_; return unless ref($e); if (lc($e->tag) eq 'body') { my $p = HTML::Element->new("p"); $p->push_content($comment); $e->push_content($p); return 1; } foreach ($e->content_list) { return if add_comment($_, $comment); } } my $filename = shift || die "You must specfy an input file"; my $comment = shift; $IO::HTML::default_encoding = 'UTF-8'; my $root = HTML::TreeBuilder->new_from_file((IO::HTML::html_file($filename))); prune $root; shake $root; my $clean = purify($root); add_comment($clean, $comment) if defined($comment); print $clean->as_HTML; $clean->delete; $root->delete;