atom_feed.pl

    1:- module(atom_feed, [ new_feed/2
    2                     , author/2
    3                     , content/2
    4                     , description/2
    5                     , email/2
    6                     , entry/2
    7                     , href/2
    8                     , id/2
    9                     , link/2
   10                     , name/2
   11                     , published/2
   12                     , rel/2
   13                     , summary/2
   14                     , title/2
   15                     , type/2
   16                     , updated/2
   17                     ]
   18         ).   19
   20:- use_module(library(charsio), [open_chars_stream/2]).   21:- use_module(library(delay)).   22:- use_module(library(http/http_open), [http_open/3]).   23:- use_module(library(http/http_ssl_plugin)).  % for SSL
   24:- use_module(library(sgml), [load_sgml/3]).   25:- use_module(library(xpath)).   26
   27
   28:- multifile delay:mode/1.
   29delay:mode(system:atomic_list_concat(ground,ground,_)).
   30delay:mode(system:atomic_list_concat(_,ground,ground)).

new_feed(?Source, -Feed) is det

Read a Feed from Source. Source is one of

stream(Stream) - an input stream
file(File) - a file name to read from
atom(Atom) - Atom or RSS XML in an atom
codes(Codes) - Atom or RSS XML in a list of codes
url(Url) - a URL to fetch

This is the first step in working with an Atom or RSS feed. If an RSS feed has multiple channels, only the first channel is considered.

   45new_feed(stream(Stream), Feed) :-
   46    parse_xml(Stream, Flavor, Tree),
   47    wrap_feed(Flavor, Tree, Feed).
   48new_feed(file(File), Feed) :-
   49    parse_xml(File, Flavor, Tree),
   50    wrap_feed(Flavor, Tree, Feed).
   51new_feed(atom(Atom), Feed) :-
   52    atom_codes(Atom, Codes),
   53    new_feed(codes(Codes), Feed).
   54new_feed(codes(Codes), Feed) :-
   55    open_chars_stream(Codes, Stream),
   56    new_feed(stream(Stream), Feed).
   57new_feed(url(Url), Feed) :-
   58    setup_call_cleanup( http_open(Url, Stream, [ timeout(10)
   59                                               , cert_verify_hook(ssl_verify)
   60                                               ])
   61                      , new_feed(stream(Stream), Feed)
   62                      , close(Stream)
   63                      ).
   64
   65% accept all SSL certificates
   66ssl_verify( _SSL
   67          , _ProblemCertificate
   68          , _AllCertificates
   69          , _FirstCertificate
   70          , _Error
   71          ).
   72
   73
   74% convenience for new_feed/2 when parsing XML
   75parse_xml(Source, Flavor, Tree) :-
   76    load_sgml( Source
   77             , Parts
   78             , [ dialect(xmlns)
   79               , call(urlns, url_ns)
   80               %, call(xmlns, on_xmlns)  % to be notified of xmlns declarations
   81               ]
   82             ),
   83    member(Root-Flavor, [(atom:feed)-atom, rss-rss]),
   84    Tree = element(Root,_Attrs,_Children),
   85    memberchk(Tree, Parts),
   86    !.  % accept the first solution
   87
   88
   89
   90wrap_feed(atom, Tree, atom_feed(Tree)).
   91wrap_feed(rss, RssTree, rss_feed(ChannelTree)) :-
   92    once(xpath(RssTree, /rss/channel, ChannelTree)).
   93
   94
   95% map URLs to namespace prefixes (for convenience)
   96url_ns('http://www.w3.org/2005/Atom', atom, _).
   97
   98
   99% convenience for matching XPath expressions w/o needing matched element
  100xpath(Dom, Spec) :-
  101    xpath(Dom, Spec, _).

id(+Item, -Id:atom) is det: True if Id is the 'id' of Item. Item can be a feed or an entry. RSS feeds don't have IDs.

  108id(atom_feed(Dom), IdText) :-
  109    once(xpath(Dom, /(atom:feed)/(atom:id), Id)),
  110    once(xpath(Id, /'*'(text), IdText)).
  111id(atom_entry(Dom), IdText) :-
  112    once(xpath(Dom, /(atom:entry)/(atom:id), Id)),
  113    once(xpath(Id, /'*'(text), IdText)).
  114id(rss_entry(Dom), IdText) :-
  115    once(xpath(Dom, /item/guid(text), IdText)).

author(+Item, -Author) is nondet: True if Author is the 'author' of Item. Item can be a feed or an entry. An author is a compound item. See name/2 for a predicate that works with this item. RSS feeds don't have an author.

  123author(atom_feed(Dom), atom_author(Author)) :-
  124    xpath(Dom, /(atom:feed)/(atom:author), Author).
  125author(atom_entry(Dom), atom_author(Author)) :-
  126    xpath(Dom, /(atom:entry)/(atom:author), Author).
  127author(rss_entry(Dom), rss_author(Author)) :-
  128    xpath(Dom, /item/author, Author).

content(+Entry, -Content:atom) is semidet: True if Content is the 'content' of Entry. There's currently no way to access the content's type or src attributes.

  135content(atom_entry(Entry), ContentText) :-
  136    once(xpath(Entry, /(atom:entry)/(atom:content), Content)),
  137    once(xpath(Content, /'*'(text), ContentText)).
  138content(rss_entry(E), ContentText) :-
  139    Entry = rss_entry(E),
  140    \+ link(Entry, _),  % no link means description is the content
  141    description(Entry, ContentText).
  142
  143
  144%% description(+Entry, -Description:atom) is semidet.
  145%
  146%  True if Entry has a Description. This predicate is peculiar to RSS
  147%  entries. The RSS spec confounds summary and content into a single
  148%  'description' field.
  149description(rss_entry(Entry), Description) :-
  150    once(xpath(Entry, /item/description(text), Description)).

summary(+Entry, -Summary:atom) is semidet: True if Summary is a summary of Entry. Fails if an entry has no summary or the summary violates standards described in the Atom spec (exact copy the entry's title, etc).
It's not possible to reliably extract summaries from RSS entries. That's because summary and content is confounded into a single 'description' element.

  162summary(atom_entry(Entry), SummaryText) :-
  163    % summary on entry is optional. See RFC4287 4.2.13
  164    once(xpath(Entry, /(atom:entry)/(atom:summary), Summary)),
  165    once(xpath(Summary, /'*'(text), SummaryText)),
  166
  167    % must have text different from 'content' and 'title'
  168    dif(SummaryText, Title),
  169    title(atom_entry(Entry), Title),
  170    ( content(atom_entry(Entry), Content) -> dif(SummaryText, Content) ; true ).

title(+Item, -Title:atom) is det: True if Title is the title of Item. Item can be a feed, entry or link.

  177title(atom_feed(Dom), TitleText) :-
  178    once(xpath(Dom, /(atom:feed)/(atom:title), Title)),
  179    once(xpath(Title, /'*'(text), TitleText)).
  180title(rss_feed(Dom), TitleText) :-
  181    once(xpath(Dom, /channel/title(text), TitleText)).
  182title(atom_entry(Dom), TitleText) :-
  183    xpath(Dom, /(atom:entry)/(atom:title), Title),
  184    once(xpath(Title, /'*'(text), TitleText)).
  185title(rss_entry(Dom), TitleText) :-
  186    once(xpath(Dom, /item/title(text), TitleText)).
  187title(atom_link(Dom), TitleText) :-
  188    Dom = element(_,Attrs,_),
  189    memberchk(title=TitleText, Attrs).

entry(+Feed, -Entry) is nondet: True if Entry is an entry of Feed. Iterates of the feed's entries on backtracking.

  196entry(atom_feed(Dom), atom_entry(Entry)) :-
  197    xpath(Dom, /(atom:feed)/(atom:entry), Entry).
  198entry(rss_feed(Dom), rss_entry(Entry)) :-
  199    xpath(Dom, /channel/item, Entry).

link(+Item, -Link) is nondet

True if Link is a 'link' of Item. Item can be a feed or an entry. Link is a compound item. One typically uses predicates like rel/2 and type/2 to refine which link is desired.

For example, to find the URL to an HTML alternative:

link(Feed, Link),
rel(Link, alternate),
type(Link, text/html),
href(Link, Url).

  214link(atom_feed(Dom), atom_link(Link)) :-
  215    xpath(Dom, /(atom:feed)/(atom:link), Link).
  216link(rss_feed(Dom), rss_link(Link)) :-
  217    once(xpath(Dom, /channel/link, Link)).  % spec says "the" suggesting 1
  218link(atom_entry(Dom), atom_link(Link)) :-
  219    xpath(Dom, /(atom:entry)/(atom:link), Link).
  220link(rss_entry(Dom), rss_link(Link)) :-
  221    xpath(Dom, /item/link, Link).

published(+Entry, -EpochSeconds:float) is semidet: True if Entry was published at time EpochSeconds. EpochSeconds is in the same format as that returned by get_time/1

  228published(atom_entry(Dom), Epoch) :-
  229    once(xpath(Dom, /(atom:entry)/(atom:published), Published)),
  230    xpath(Published, /'*'(text), Date),
  231    parse_time(Date, Epoch).
  232published(rss_entry(Dom),Epoch) :-
  233    once(xpath(Dom, /item/pubDate(text), Date)),    parse_time(Date, Epoch).

updated(+Entry, -EpochSeconds:float) is semidet: True if Entry was last updated at time EpochSeconds. EpochSeconds is in the same format as that returned by get_time/1.

  241updated(atom_entry(Dom), Epoch) :-
  242    once(xpath(Dom, /(atom:entry)/(atom:updated), Updated)),
  243    xpath(Updated, /'*'(text), Date),
  244    parse_time(Date, Epoch).

email(+Author, -Email:atom) is semidet: True if Name is email of Author.

  250email(atom_author(Author), EmailText) :-
  251    once(xpath(Author, /(atom:author)/(atom:email), Email)),
  252    xpath(Email, /'*'(text), EmailText).
  253email(rss_author(Author), Email) :-
  254    once(xpath(Author, /author(text), Email)).

name(+Author, -Name:atom) is semidet: True if Name is the 'name' of Author. According to the Atom spec, this should be a human readable name. RSS authors have no name, only an email. See email/2.

  262name(atom_author(Author), NameText) :-
  263    % name on author is mandatory.  See RFC4287 3.2.1
  264    once(xpath(Author, /(atom:author)/(atom:name), Name)),
  265    xpath(Name, /'*'(text), NameText).

rel(+Link, -Rel) is det: True if Rel is the relationship between Link and its parent. If the Atom XML doesn't explicitly specify a 'rel', alternate is used instead. That's why this predicate's mode is det.
Because RSS links can't specify a 'rel', we use alternate for them too.

  276rel(atom_link(Link), Rel) :-
  277    % rel on link is optional, defaults to "alternate".
  278    % See RFC4287 4.2.7.2
  279    Link = element(_,Attrs,_),
  280    ( memberchk(rel=Rel0,Attrs) ->
  281        true
  282    ; true ->
  283        Rel0 = alternate
  284    ),
  285    Rel = Rel0.
  286rel(rss_link(_), alternate).

href(+Link, -Href:atom) is det: True if Href is the 'href' attribute of Link.

  292href(atom_link(Link), Href) :-
  293    % href on link is mandatory. See RFC4287 4.2.7.1
  294    once(xpath(Link, /'*'(@href=Href))).
  295href(rss_link(Link), Href) :-
  296    Link = element(link, _, [Href]).

type(+Link, -MediaType) is semidet

True if Link has type attributed MediaType. MediaType has the form Type/Subtype which allows one to do things like

type(Link, text/_)  % link has a textual type

The Atom spec says that 'type' is optional. If it's missing, this predicate fails. RSS spec says the link points to "the HTML website", so MediaType is always text/html.

Subtype may contain punctuation so remember to quote: application/'atom+xml'.

  312type(atom_link(element(_,Attrs,_)), Type/Subtype) :-
  313    delay(atomic_list_concat([Type, Subtype], '/', RawType)),
  314
  315    % type on link is optional, with no default.
  316    % See RFC4287 4.2.7.3
  317    memberchk(type=RawType, Attrs).
  318type(rss_link(_), text/html)