#!/usr/bin/perl
# HTML Article declutter
# Daniel Beer
# 6 Feb 2014
#
# This file is in the public domain.
use strict;
use warnings;
use HTML::TreeBuilder;
use IO::HTML;
# These tags are deemed to be points of decision-making when pruning.
my %BLOCK_TAGS = (
h1 => 1,
h2 => 1,
h3 => 1,
h4 => 1,
h5 => 1,
h6 => 1,
div => 1,
section => 1,
header => 1,
footer => 1,
li => 1,
ul => 1,
ol => 1,
dl => 1,
dt => 1,
dd => 1
);
########################################################################
# Prune
#
# This is a heavy-handed filter which strips out large chunks of the
# page based on class/id keywords and tag types.
#
# Pruning is top-down.
########################################################################
sub poison {
my ($id) = @_;
$id = lc($id || '');
return 1 if $id =~ /comment/;
return 1 if $id =~ /^footer/;
return 1 if $id =~ /login/;
return 1 if $id =~ /popup/;
return 1 if $id =~ /promo/;
return 1 if $id =~ /related/;
return 1 if $id =~ /share/;
return 1 if $id =~ /slideshow/;
return 1 if $id =~ /links/;
return 1 if $id =~ /sidebar/;
return 1 if $id =~ /search/;
return 0;
}
sub should_prune {
my ($e) = @_;
my $tag = lc($e->tag);
return 1 if $tag eq 'script';
return 1 if $tag eq 'noscript';
return 1 if $tag eq 'header';
return 1 if $tag eq 'footer';
return 1 if $tag eq 'style';
return 1 if $tag eq 'img';
return 1 if $tag eq 'nav';
if ($BLOCK_TAGS{$tag}) {
return 1 if poison($e->id);
return 1 if poison($e->attr("class"));
}
return 0;
}
sub prune {
my ($e) = @_;
if (should_prune($e)) {
$e->detach;
$e->delete;
} else {
foreach ($e->content_list) {
prune($_) if ref $_;
}
}
}
########################################################################
# Shake
#
# This filter is a heuristic filter. It examines link density and
# word-counts to try and differentiate link-spam from article content.
#
# It makes these decisions bottom-up, deciding only for tags in
# %BLOCK_TAGS. If it's decided that a subtree should be kept, this
# decision propagates upwards (we can't delete a tree if it contains
# a subtree which we've decided to keep).
#
# Some kinds of tags can confer a blessing on their descendants, which
# prevents removal.
########################################################################
sub sesame {
my ($id) = @_;
$id = lc($id || '');
return 1 if $id =~ /author/;
return 1 if $id =~ /byline/;
return 0;
}
sub blessed {
my ($e) = @_;
return 1 if lc($e->tag) eq 'a' &&
($e->attr("rel") || '') eq "author";
return 1 if sesame($e->attr("id"));
return 1 if sesame($e->attr("class"));
return 0;
}
sub word_count {
my @w = split(/[^0-9A-Za-z]/, $_[0]);
my $c = 0;
foreach (@w) {
$c++ if length($_) > 0;
}
return $c;
}
sub shake {
my ($e, $pstat, $pcontext) = @_;
my $wc = 0;
my $tag = lc($e->tag);
my %context = %{$pcontext || {}};
my %stat = (
LinkWords => 0,
Words => 0,
Tags => 1,
Keep => 0
);
$context{Link} = 1 if $tag eq 'a';
$context{Blessed} = 1 if blessed($e);
foreach ($e->content_list) {
if (ref $_) {
shake($_, \%stat, \%context);
} else {
$stat{$context{Link} ? "LinkWords" : "Words"} += word_count($_);
}
}
$stat{Keep} = 1 if $context{Blessed};
if (defined($BLOCK_TAGS{$tag})) {
my $score = $stat{Words} - $stat{LinkWords} * 2 - $stat{Tags};
$stat{Keep} = 1 if $score >= 0;
unless ($stat{Keep}) {
$e->detach;
$e->delete;
}
}
if (defined($pstat)) {
$pstat->{$_} += $stat{$_} foreach(keys %stat);
}
}
########################################################################
# Purify
#
# In this filter we flatten the document structure, removing most markup
# and attributes. The text content is kept.
########################################################################
my %PURE_TAGS = (
'p' => [],
'pre' => [],
'br' => [],
'em' => [],
'strong' => [],
'h1' => [],
'h2' => [],
'h3' => [],
'h4' => [],
'h5' => [],
'html' => [],
'body' => [],
'head' => [],
'title' => [],
'meta' => ['http-equiv', 'name', 'content'],
'li' => [],
'ol' => [],
'ul' => [],
'dl' => [],
'dt' => [],
'dd' => [],
'blockquote' => []
);
sub purify {
my ($e) = @_;
my (@children);
return $e unless ref $e;
foreach ($e->content_list) {
push @children, purify($_);
}
my $t = lc($e->tag);
my $atlist = $PURE_TAGS{$t};
return @children unless defined($atlist);
my @attrs;
foreach (@{$atlist}) {
my $v = $e->attr($_);
push @attrs, $_, $v if defined($v);
}
return () if $t ne 'br' && $#children < 0 && $#attrs < 0;
my $n = HTML::Element->new($t, @attrs);
foreach (@children) {
$n->push_content($_);
}
return $n;
}
########################################################################
# Top-level
#
# Read a file, guessing encoding, then prune => shake => purify.
########################################################################
sub add_comment {
my ($e, $comment) = @_;
return unless ref($e);
if (lc($e->tag) eq 'body') {
my $p = HTML::Element->new("p");
$p->push_content($comment);
$e->push_content($p);
return 1;
}
foreach ($e->content_list) {
return if add_comment($_, $comment);
}
}
my $filename = shift || die "You must specfy an input file";
my $comment = shift;
$IO::HTML::default_encoding = 'UTF-8';
my $root = HTML::TreeBuilder->new_from_file((IO::HTML::html_file($filename)));
prune $root;
shake $root;
my $clean = purify($root);
add_comment($clean, $comment) if defined($comment);
print $clean->as_HTML;
$clean->delete;
$root->delete;