#!/yer/perl/here use strict; # amphetasax - SAX2 RSS to hash parser # Version 0.1, October 04 2002 # Copyright (c) 2002, Aaron Straup Cope. All Right Reserved. # This is free software, you may use it and distribute it under the same terms as Perl itself. package RSSMunger; use base qw (XML::SAX::Base); # Because you're not allowed to see the cat. sub rss { my $self = shift; return $self->{"__rss"}; } sub start_element { my $self = shift; my $data = shift; $self->{'__level'} ++; # Note the use of 'Name' rather than # 'LocalName'; this preserves the ns prefix $self->{'__last'} = $data->{Name}; if ($self->{'__level'} == 1) { # Hello, this is Namespace Lala-land calling! $self->_parse_xmlns($data->{Attributes}); } if ($data->{LocalName} eq "channel") { $self->{'__channel'} = 1; return 1; } if ($data->{LocalName} eq "item") { $self->{'__item'} = 1; # We predeclare the hash so that we don't # accidentally get weird errors later on $self->{'__rss'}{items}->[$self->{'__curitem'}] = {}; return 1; } if ($self->{'__item'}) { # Note that I am checking the value of # 'Name' which doesn't contain the prefix. # If I cared about the prefix I would also # check $data->{Prefix} # You could also compare "foo:bar" against # $data->{LocalName} but that seems a bit less # elegant to me - it's also more to compare # unless you really care about the prefix # See if we have any new namespaces # Why? Because we love namespaces! $self->_parse_xmlns($data->{Attributes}); # This is one of those things that's fine to # do for one or two elements, but quickly # becomes unmaintainable with more. if ($data->{LocalName} =~ /^(reference|license)$/) { if (my $resource = $self->_get_rdf_resource($data->{Attributes})) { # Again, note that we are passing Name, # rather than LocalName to preserve the prefixes # SAX loves hash references so we love them too! $self->_set_item_prop({$data->{Name}=>$resource}); } return 1; } } # print "$data->{Name} ($data->{LocalName})\n"; return 1; } sub end_element { my $self = shift; my $data = shift; if ($data->{LocalName} eq "channel") { $self->{'__channel'} = 0; } if ($data->{LocalName} eq "item") { $self->{'__item'} = 0; # Because computers have a zero-based # count which I'm assured makes perfect # sense. But I studied painting, and it # just bugs my ass, all the same. $self->{'__curitem'} ++; } $self->{'__level'} --; return 1; } sub characters { my $self = shift; my $data = shift; # I'm a bit perplexed that I # still need to do this. It # may be the funny MacOS return # characters on Ben's feed. $data->{Data} =~ s/^(\s)+//m; $data->{Data} =~ s/(\s)$//m; if (! $data->{Data}) { # You may want to finesse # this but since Perl will auto- # vivify anything hash elements # that don't get explicitly added, # maybe not. return 0; } if ($self->{'__item'}) { $self->_set_item_prop({$self->{'__last'} => $data->{Data}}); } return 1; } sub _set_item_prop { my $self = shift; my $props = shift; foreach (keys %$props) { # Presumably, this would be finessed # so that some things would be pushed # and others just appeneded if (ref($self->{'__rss'}{items}->[$self->{'__curitem'}]->{$_})) { push @{$self->{'__rss'}{items}->[$self->{'__curitem'}]->{$_}}, $props->{$_}; } else { $self->{'__rss'}{items}->[$self->{'__curitem'}]->{$_} = $props->{$_}; } } return 1; } sub _get_rdf_resource { my $self = shift; my $attrs = shift; foreach my $ns (keys %$attrs) { next unless $attrs->{$ns}->{Name} eq "rdf:resource"; # Be anal-retentive and # check NamespaceURI here return $attrs->{$ns}->{Value}; } return undef; } sub _parse_xmlns { my $self = shift; my $attrs = shift; foreach my $ns (keys %$attrs) { # We could just to a regex(p) on $ns # but why waste the computrons when # we can write these nasty SAX2 queries? next unless ($attrs->{$ns}->{Prefix} eq "xmlns"); # This is really just for readability my $ns_uri = $attrs->{$ns}->{Value}; my $ns_prefix = $attrs->{$ns}->{LocalName}; # Presumably, there should be some check # for conflicts here $self->{'__ns'}{$ns_uri} = $ns_prefix; } } package main; use XML::SAX::ParserFactory; $XML::SAX::ParserPackage = "XML::SAX::Expat"; use LWP::Simple; use Data::Dumper; use constant RSS => "http://rss.benhammersley.com/index.rdf"; my $munger = RSSMunger->new(); my $parser = XML::SAX::ParserFactory->parser(Handler=>$munger); # Because the Expat parser doesn't know how # to fetch remote URIs; the PurePerl parser would # be able to deal with : $parser->parse_uri(RSS) $parser->parse_string(get(RSS)); print &Dumper($munger->rss());