Summary: a plugin that parses a web page, extracts feed URIs, and subscribes to them
For parsing a web page and subscribing to its feeds there is already Subscription::XPath and the like, but the degree of freedom is a bit low, and I figured it would be nice to be able to pull links out with regular expressions too, so I gave it a try.
package Plagger::Plugin::Subscription::Parse;
use strict;
use warnings;

use URI;
use Encode ();
use HTML::TreeBuilder::XPath;
use HTML::Entities ();

use Plagger::Util;
use Plagger::Tag;
use Plagger::Feed;

use base qw( Plagger::Plugin );

sub register {
    my ( $self, $c ) = @_;
    $c->register_hook(
        $self,
        'subscription.load' => $self->can('load'),
    );
}

sub load {
    my ( $self, $c ) = @_;

    my $confs = $self->conf->{'subscribe'};
    $confs = [ $confs ] if ( ref $confs ne 'ARRAY' );

    my $default = $self->conf->{'default'} || {};

    for my $conf ( @{ $confs } ) {
        # a bare string is shorthand for { target => $url }
        $conf = { target => $conf } if ( ! ref $conf );

        my $uri = $conf->{'target'};
        unless ( $uri ) {
            $c->log( error => "Target url is missing." );
            next;
        }
        Encode::_utf8_off( $uri );

        # per-target settings win over the 'default' section
        for my $name ( qw( xpath regexp after_hook ) ) {
            $conf->{$name} ||= $default->{$name};
        }

        # fall back to a generic XPath only when neither 'xpath' nor
        # 'regexp' is given, so that a regexp-only config takes effect
        if ( ! defined $conf->{'xpath'} && ! defined $conf->{'regexp'} ) {
            $conf->{'xpath'} = {
                url   => '//a/@href',
                title => '//a/@title',
            };
        }

        my $meta = {};
        $meta->{'link'} = $conf->{'target'};
        for my $name ( qw( title meta tag ) ) {
            $meta->{$name} = $conf->{$name} || $default->{$name};
        }

        if ( $meta->{'tag'} && ! ref $meta->{'tag'} ) {
            $meta->{'tag'} = [ Plagger::Tag->parse( $meta->{'tag'} ) ];
        }

        $c->log( info => "Parse $uri" );

        my $xhtml = Plagger::Util::load_uri( URI->new( $uri ), $self );
        $xhtml = Plagger::Util::decode_content( $xhtml );

        # the page <title> is the fallback feed title
        my $title;
        if ( $xhtml =~ m{<title>\s*(.*?)\s*</title>}i ) {
            $title = HTML::Entities::decode( $1 );
        }
        $meta->{'title'} ||= $title || $meta->{'link'};

        my $datas;
        if ( defined $conf->{'xpath'} ) {
            $datas = $self->find_node( $c, $conf->{'target'}, $xhtml, $conf->{'xpath'} )
                or next;
        }
        else {
            $datas = $self->parse( $c, $conf->{'target'}, $xhtml, $conf->{'regexp'} )
                or next;
        }

        # after_hook is eval'ed once per extracted record, with $data in scope
        if ( $conf->{'after_hook'} ) {
            for my $data ( @{ $datas } ) {
                eval $conf->{'after_hook'};
                $c->error( $@ ) if ( $@ );
            }
        }

        for my $feed ( $self->create_feed( $c, $datas, $meta ) ) {
            $c->subscription->add( $feed );
        }
    }
}

sub find_node {
    my ( $self, $c, $url, $xhtml, $conf ) = @_;

    if ( ! $conf->{'url'} ) {
        $c->log( error => "'url' XPath is not specified. :$url" );
        return;
    }

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse( $xhtml );
    $tree->eof;

    # use Plagger's XML escaping while serializing matched elements
    no warnings 'redefine';
    local *HTML::Element::_xml_escape = $self->can('xml_escape');
    use warnings;

    # the i-th node matched by each XPath expression goes into $data->[$i]
    my $data = [];
    for my $capture ( keys %{ $conf } ) {
        next if ( ! $conf->{$capture} );

        my $index = 0;
        for my $child ( $tree->findnodes( $conf->{$capture} ) ) {
            my $value = ( $child->isElementNode )
                      ? $child->as_XML
                      : $child->getValue;
            $data->[$index] = {} if ( ! ref $data->[$index] );
            $data->[$index]->{$capture} = $value;
            $index++;
        }
    }

    return $data;
}

sub parse {
    my ( $self, $c, $url, $xhtml, $conf ) = @_;

    my @capture = split m{\s+}, $conf->{'capture'};
    if ( ! @capture ) {
        $c->log( error => "There is no available 'capture'. :$url" );
        return;
    }

    if ( ! $conf->{'extract'} ) {
        $c->log( error => "'extract' is not specified. :$url" );
        return;
    }

    my $extract = Plagger::Util::decode_content( $conf->{'extract'} );

    # walk the content match by match; each set of captured groups
    # becomes one record, keyed by the names listed in 'capture'
    my ( $cur_pos, $prev_pos ) = ( 0, 0 );
    my $data  = [];
    my $index = 0;
    while ( 1 ) {
        if ( $xhtml =~ m{$extract}sg ) {
            $cur_pos = pos $xhtml;
            my $str = substr $xhtml, $prev_pos, length( $xhtml );
            if ( my @matched = ( $str =~ m{$extract}s ) ) {
                for my $name ( @capture ) {
                    $data->[$index] = {} if ( ! ref $data->[$index] );
                    $data->[$index]->{$name} = shift @matched;
                }
                $index++;
                $prev_pos = $cur_pos;
            }
            else {
                last;
            }
        }
        else {
            last;
        }
    }

    return $data;
}

sub create_feed {
    my ( $self, $c, $datas, $meta ) = @_;

    my @feeds = ();
    for my $data ( @{ $datas } ) {
        my $feed = Plagger::Feed->new;
        $feed->url( URI->new_abs( $data->{'url'}, $meta->{'link'} ) );
        $feed->title( $data->{'title'} || $meta->{'title'} );
        $feed->link( $meta->{'link'} );
        $feed->meta( $meta->{'meta'} ) if ( $meta->{'meta'} );
        $feed->tags( $meta->{'tag'} ) if ( $meta->{'tag'} );
        push @feeds, $feed;
    }

    return @feeds;
}

sub xml_escape {
    for my $x ( @_ ) {
        $x = Plagger::Util::encode_xml( $x );
    }
}

1;

__END__

=head1 NAME

Plagger::Plugin::Subscription::Parse - Use XPath or Regexp to extract subscriptions from web pages

=head1 SYNOPSIS

  - module: Subscription::Parse
    config:
      subscribe:
        - target: http://d.hatena.ne.jp/antipop/20050628/1119966355
          xpath:
            url: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@href
            title: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@title

=head1 DESCRIPTION

This plugin extracts subscriptions from XHTML content, using XPath or
regular expressions to find links.

=head1 CONFIG

=head2 subscribe

=over 4

=item target

URI of the web page to analyze.

=item xpath

A map of names to XPath expressions, one per piece of data to extract.

=over 4

=item url

XPath expression that extracts the subscription URIs.

=item title

XPath expression that extracts the subscription titles.

=item other names

An XPath expression given under any other name stores its result in
C<< $data->{'name'} >>.

=back

=item regexp

Regular-expression settings for extracting each piece of data.

=over 4

=item extract

The regular expression used to extract the data.

=item capture

Space-delimited names for the groups captured by the regular
expression; each value is stored in C<< $data->{$name} >>.

=back

=item after_hook

Perl code that post-processes each extracted record.

=item title

The feed title.

=item meta

The feed metadata.

=item tag

The feed tags.

=back

=head2 default

Default values applied when a C<subscribe> entry has no individual
setting.

=over 4

=item xpath

=item regexp

=item after_hook

=item meta

=item tag

=back

=head1 AUTHOR

Naoki Okamura (Nyarla,) E<lt>thotep@nyarla.netE<gt>

=head1 LICENSE

This plug-in is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<Plagger>

=cut
Usage looks like this; see the POD for the finer configuration details. The POD's English went through machine translation, so it may be a bit shaky.
plugins:
  - module: Subscription::Parse
    config:
      subscribe:
        - target: http://localhost/list.html
          xpath:
            url: //ul/li/a/@href
            title: //ul/li/a/@title
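The example above uses XPath. To pull links out with a regular expression instead, give a regexp block with extract and capture, as documented in the POD. A minimal sketch, assuming the target page marks its links up as plain <li><a href="...">...</a></li> items (the URL is a placeholder):

plugins:
  - module: Subscription::Parse
    config:
      subscribe:
        - target: http://localhost/list.html
          regexp:
            extract: <li><a href="(.*?)">(.*?)</a></li>
            capture: url title

Each match produces one record: the first captured group is stored under url and the second under title, following the order of the names in capture.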
I wrote it using CustomFeed::Config, Filter::EntryFullText, and Subscription::Config as references.
I think this covers most cases of parsing a web page, pulling out URIs, and subscribing to them.
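For instance, several pages can share one set of extraction rules through the default section, and bare strings under subscribe are treated as target values. A sketch with placeholder URLs and an arbitrary tag:

plugins:
  - module: Subscription::Parse
    config:
      default:
        xpath:
          url: //a/@href
          title: //a/@title
        tag: blogroll
      subscribe:
        - http://example.com/links.html
        - http://example.org/blogroll.html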
It supports both XPath and regular expressions, and after_hook lets you massage the extracted data, so it should be flexible enough for all sorts of things.
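Since after_hook is eval'ed once per extracted record with $data in scope, a one-liner in the config is enough to rewrite values before the feeds are created. A sketch; the substitution pattern is only an illustration:

plugins:
  - module: Subscription::Parse
    config:
      subscribe:
        - target: http://localhost/list.html
          after_hook: |
            $data->{'url'} =~ s{/index\.html$}{/atom.xml};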
By the way, I scaffolded it with plugin-start.pl, so I wrote proper tests too. I did write them, but pasting them into this entry would make it far too long, so I've left them out. Also, the plugin is in a committable state, but I don't have the courage to commit it to the upstream repository.
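For reference, here is a minimal sketch of the kind of direct test one could write against the regexp path; it is not the omitted suite. It calls parse with a dummy context, which only works because $c is used there solely for error logging, and it assumes Plagger::Util::decode_content copes with plain ASCII input outside a full Plagger run:

use strict;
use warnings;
use Test::More tests => 3;

use Plagger::Plugin::Subscription::Parse;

# fixture: two links we expect to extract
my $html = <<'HTML';
<ul>
  <li><a href="http://example.com/foo">Foo</a></li>
  <li><a href="http://example.com/bar">Bar</a></li>
</ul>
HTML

# bypass the constructor; parse() needs no plugin state
my $plugin = bless {}, 'Plagger::Plugin::Subscription::Parse';

my $data = $plugin->parse(
    undef,    # $c is only used on error paths
    'http://example.com/',
    $html,
    {
        extract => '<a href="(.*?)">(.*?)</a>',
        capture => 'url title',
    },
);

is scalar @{ $data }, 2, 'two links extracted';
is $data->[0]{'url'},   'http://example.com/foo', 'first url';
is $data->[1]{'title'}, 'Bar',                    'second title';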
I'd like to publish the tests as well, but I don't have a public repository for them, which is awkward. There's Google Code Project Hosting, but that has its own rough edges, so I'm hoping something works out.
I haven't written tests for my earlier plugins yet, so I'll be writing those next.
Update
Fixed a bug where decode_content was not being called.