#!/yer/perl/here use strict; =head1 NAME XML::Handler::RSS - RSS to hash-ref parser =head1 SYNOPSIS use XML::Handler::RSS; use XML::SAX::ParserFactory; $XML::SAX::ParserPackage = "XML::SAX::Expat"; use LWP::Simple; use Data::Dumper; my $handler = XML::Handler::RSS->new(Clarkify=>1); my $parser = XML::SAX::ParserFactory->parser(Handler=>$handler); # Because the Expat parser doesn't know how # to fetch remote URIs; the PurePerl parser would # be able to deal with : $parser->parse_uri(RSS) $parser->parse_string(get($ARGV[0])); print "$feed:\n".&Dumper($handler->rss()); =head1 DESCRIPTION XML::Handler::RSS is a RSS to hash-ref parser. It is not a general purpose XML to hash parser, nor is it especially smart about adding new elements not already accounted for in the code. It's not even that much faster than XML::Handler::2Simple. Slower even : Benchmark: timing 10000 iterations of XML::Handler::2Simple, XML::Handler::RSS... XML::Handler::2Simple: 5617 wallclock secs (4834.35 usr + 16.48 sys = 4850.84 CPU) @ 2.06/s (n=10000) XML::Handler::RSS: 6032 wallclock secs (5210.20 usr + 13.90 sys = 5224.10 CPU) @ 1.91/s (n=10000 It can, however, de-Clark-ify element names and tries to be smart about using references for elements that occur more than once, for example. But, someone asked me about parsing RSS with SAX and it's been a good exercise. =cut package RSSMunger; use base qw (XML::SAX::Base); $XML::Handler::RSS::VERSION = '0.1'; use constant LOCAL_NS => { "" => "http://purl.org/rss/1.0/", "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdfs" => "http://www.w3.org/2000/01/rdf-schema#", "sy" => "http://purl.org/rss/1.0/modules/syndication/", "admin" => "http://webns.net/mvcb/", "annotate" => "http://purl.org/rss/1.0/modules/annotate/", "dcterms" => "http://purl.org/dc/terms/", "cc" => "http://web.resource.org/cc/", "content" => "http://purl.org/rss/1.0/modules/content/", "foaf" => "http://xmlns.com/foaf/0.1/", "trackback" => "http://madskills.com/public/xml/rss/module/trackback/", "dublincore" => "http://purl.org/dc/elements/1.1/", }; =head1 PACKAGE METHODS =head2 __PACKAGE__->new(%args) This method is subclassed from I and accepts the following additional parameters : =over 4 =item * B Boolean. Replace namespace prefixes, in key names, with hideously ugly but correct James Clark-ian namespace syntax. For example, dc:creator ...would become... {http://purl.org/dc/elements/1.1/}creator Default is 0 =back =cut sub new { my $pkg = shift; my $class = ref($pkg) || $pkg; my $args = ref($_[0] eq "HASH") ? $_[0] : {@_}; my $self = {}; bless $self,$class; $self->{"__clarkify"} = $args->{'Clarkify'}; $self->{"__rss"} = {}; $self->SUPER::new($args); return $self; } sub start_document { my $self = shift; $self->{'__rss'} = {}; $self->SUPER::start_document(); } sub end_document { my $self = shift; $self->SUPER::end_document(); return $self->{'__rss'}; } sub start_element { my $self = shift; my $data = shift; $self->{'__level'} ++; # Note that I am usually checking the value of # 'LocalName' which doesn't contain the prefix. # If I cared about the prefix I would also # check $self->_isa($name,$data) # However, here we use of 'Name' rather than # 'LocalName' in order to preserve the ns prefix # attached to the feed. # This is asking for trouble, but hey we're only # at version 0.2 $self->{'__last'} = $data->{Name}; # Hello, this is Namespace Lala-land calling! $self->_parse_xmlns($data->{Attributes}); # if ($data->{LocalName} eq "channel") { $self->{'__channel'} = 1; return 1; } if ($self->{'__channel'}) { $self->_do_channel($data); return 1; } # if ($data->{LocalName} eq "item") { $self->{'__item'} = 1; # We predeclare the hash so that we don't # accidentally get weird errors later on $self->{'__rss'}{items}->[$self->{'__curitem'}] = {}; return 1; } # if ($self->{'__item'}) { $self->_do_item($data); return 1; } # print "$data->{Name} ($data->{LocalName})\n"; return 1; } sub end_element { my $self = shift; my $data = shift; if ($data->{LocalName} eq "channel") { $self->{'__channel'} = 0; } # dublincore if ($self->_isa("dublincore",$data)) { if ($data->{LocalName} eq "creator") { if (my $foaf = $self->_set_curfoaf()) { $self->_set([ $self->{"__rss"}{"channel"}, $data->{Name}, $foaf, ]); } $self->{"__curfoaf"} = {}; $self->{"__dc:creator"} = 0; } if ($data->{LocalName} eq "contributor") { if (my $foaf = $self->_set_curfoaf()) { my $contributors = $self->_element_name(join(":", $self->{"__ns"}{LOCAL_NS->{"dublincore"}}, "contributors")); $self->{'__rss'}{"items"}->[$self->{"__curitem"}]->{"$contributors"} ||= []; $self->_set([ $self->{'__rss'}{"items"}->[$self->{"__curitem"}], "$contributors", $foaf, ]); } $self->{"__curfoaf"} = {}; $self->{"__curcontrib"} ++; $self->{"__dc:creator"} = 0; } } # /dublincore if ($data->{LocalName} eq "item") { $self->{"__curcontrib"} = 0; $self->{"__item"} = 0; # Because computers have a zero-based # count which I'm assured makes perfect # sense. But I studied painting, and it # just bugs my ass, all the same. $self->{'__curitem'} ++; } $self->{'__level'} --; return 1; } sub characters { my $self = shift; my $data = shift; # I'm a bit perplexed that I # still need to do this. It # may be the funny MacOS return # characters on Ben's feed. $data->{Data} =~ s/^(\s)+//m; $data->{Data} =~ s/(\s)$//m; if (! $data->{Data}) { # You may want to finesse # this but since Perl will auto- # vivify anything hash elements # that don't get explicitly added, # maybe not. return 0; } if ($self->{'__item'}) { $self->_set_item_prop({$self->{'__last'} => $data->{Data}}); } if ($self->{'__channel'}) { $self->_set_channel_prop({$self->{'__last'} => $data->{Data}}); } return 1; } sub _do_channel { my $self = shift; my $data = shift; if (($self->{LocalName} eq "items") && ($self->_isa("rdf",$data))) { # Fuck off and die! # You're a bad idea that should have never # have made it off of the napkin! # If I've upset you, use base qw (__PACKAGE__) # No amount of convincing will change my mind. return 0; } # if (($data->{LocalName} eq "creator") && ($self->_isa("dublincore",$data))) { $self->{'__dc:creator'} = 1; return 1; } # if ($self->_isa("foaf",$data)) { if ($self->{'__dc:creator'}){ $self->{"__curfoaf"} ||= {}; $self->_do_foaf($self->{"__curfoaf"},$data); } return 1; } # if (my $resource = $self->_get_rdf_resource($data->{Attributes})) { $self->_set_channel_prop({$data->{Name}=>$resource}); return 1; } # return 1; } sub _do_item { my $self = shift; my $data = shift; # if (($self->_isa("dublincore",$data)) && ($data->{LocalName} eq "contributor")) { $self->{'__dc:contributor'} = 1; return 1; } # if ($self->_isa("foaf",$data)) { if ($self->{"__dc:contributor"}) { $self->{"__curfoaf"} ||= {}; $self->_do_foaf($self->{"__curfoaf"},$data); } return 1; } # if (my $resource = $self->_get_rdf_resource($data->{Attributes})) { $self->_set_channel_prop({$data->{Name}=>$resource}); return 1; } # return 1; } sub _do_foaf { my $self = shift; my $ref = shift; my $data = shift; my $name = $self->_element_name($data->{Name}); my $prop = $self->_get_foaf_prop($data); $self->_set([$ref,$name,$prop]); return 1; } sub _set_channel_prop { my $self = shift; my $props = shift; foreach my $key (keys %$props) { my $name = $self->_element_name($key); my $ref = $self->{'__rss'}{"channel"}; $self->_set([$ref,$name,$props->{$key}]); } return 1; } sub _set_item_prop { my $self = shift; my $props = shift; foreach my $key (keys %$props) { # Presumably, this would be finessed # so that some things would be pushed # and others just appeneded my $name = $self->_element_name($key); my $ref = $self->{'__rss'}{"items"}->[$self->{'__curitem'}]; $self->_set([$ref,$name,$props->{$key}]); } return 1; } sub _set_curfoaf { my $self = shift; if (keys %{$self->{"__curfoaf"}}) { my $cur_foaf = $self->{'__curfoaf'}; my $prefix = $self->{'__ns'}{LOCAL_NS->{"foaf"}}; my $foafname = $self->_element_name("$prefix:Person"); # Note the use of $data->{Name} which will # contain whatever random prefix Joe User # has assigned the Dublin Core creator element $self->{"__rss"}{"foaf"}{$cur_foaf->{$foafname}} ||= $cur_foaf; return $self->{"__rss"}{"foaf"}{$cur_foaf->{$foafname}}; } return undef; } sub _set { my $self = shift; my $what = shift; # Aaaaaaaaahhhhhhhhh!!!! # My eyes! They burn!! # $what->[0] = data reference # $what->[1] = el name (note double quotes in # case of clark-ification of name) # $what->[2] = el property if (ref($what->[0]->{"$what->[1]"})) { push @{$what->[0]->{"$what->[1]"}}, $what->[2]; } else { $what->[0]->{"$what->[1]"} = $what->[2]; } } sub _get_rdf_resource { my $self = shift; return $self->_get_attribute(LOCAL_NS->{"rdf"},"resource",@_); } sub _get_rdf_about { my $self = shift; return $self->_get_attribute(LOCAL_NS->{"rdf"},"about",@_); } sub _get_foaf_prop { my $self = shift; my $data = shift; if ($data->{LocalName} eq "Person") { return $self->_get_attribute($data->{NamespaceURI},"name",$data->{Attributes}); } else { return $self->_get_rdf_resource($data->{Attributes}); } } sub _get_attribute { my $self = shift; my $nsuri = shift; my $name = shift; my $attrs = shift; foreach my $ns (keys %$attrs) { next unless $attrs->{$ns}->{NamespaceURI} eq $nsuri; next unless $attrs->{$ns}->{LocalName} eq $name; return $attrs->{$ns}->{Value}; } return undef; } sub _parse_xmlns { my $self = shift; my $attrs = shift; foreach my $ns (keys %$attrs) { # We could just to a regex(p) on $ns # but why waste the computrons when # we can write these nasty SAX2 queries? next unless ($attrs->{$ns}->{Prefix} eq "xmlns"); # This is really just for readability my $ns_uri = $attrs->{$ns}->{Value}; my $ns_prefix = $attrs->{$ns}->{LocalName}; # Presumably, there should be some check # for conflicts here # This is probably a good candidate for some # fancy pants liked-list kung-fu that only # the equivalent of Darida-esque Perl programmer # would understand. $self->{'__ns'}{$ns_uri} = $ns_prefix; $self->{'__prefix'}->{$ns_prefix} = $ns_uri; } } sub _isa { my $self = shift; my $ns = shift; my $data = shift; return ($data->{NamespaceURI} eq LOCAL_NS->{$ns}); } sub _element_name { my $self = shift; return ($self->{'__clarkify'}) ? $self->_clarkify($_[0]) : $_[0]; } sub _clarkify { my $self = shift; my $element = shift; $element =~ /^(?:([^\:]+):)?(.*)$/; my $ns = ($1) ? $self->{'__prefix'}{$1} : LOCAL_NS->{""}; return "{$ns}$2"; } sub _declarkify { my $self = shift; my $element = shift; $element =~ s/^\{([^}]+)\}(.*)$/$self->{'__ns'}{$1}:$2/; return $element; } =head1 VERSION 0.2 =head1 DATE October 05, 2002 =head1 AUTHOR Aaron Straup Cope =head1 SEE ALSO L =head1 LICENSE Copyright (c) 2002, Aaron Straup Cope. All Rights Reserved. This is free software, you may use it and distribute it under the same terms as Perl itself. =cut package main; use XML::Handler::2Simple; use XML::SAX::ParserFactory; $XML::SAX::ParserPackage = "XML::SAX::Expat"; use Data::Dumper; use LWP::Simple; use Benchmark; use constant RSS => "http://rss.benhammersley.com/index.rdf"; { &main(); exit; } sub main { my $feed = $ARGV[0] || RSS; &rss($feed); } sub benchamrk { timethese(10000, { 'XML::Handler::RSS' => \&rss, 'XML::Handler::2Simple' => \&simple }); } sub rss { my $handler = RSSMunger->new(Clarkify=>1); my $parser = XML::SAX::ParserFactory->parser(Handler=>$handler); my $rss = $parser->parse_string(get($_[0])); print &Dumper($rss); } sub simple { my $handler = XML::Handler::2Simple->new(forcearray=> [ "item" ], keyattr => [ ], suppressempty=>undef); my $parser = XML::SAX::ParserFactory->parser(Handler=>$handler); my $p = $parser->parse_string(get($_[0])); }