scrapes.pl

    1:- module(html_scrapes, [scrape_row/2]).    2
    3:- use_module(library(sgml)).

scrape_row(+URL, -Row) is nondet: Scrapes all table rows non-deterministically by row within each table. Tables must have table headers, thead elements.
Scrapes distinct rows. Distinct is important because HTML documents contain tables within tables within tables. Attempts to permit some flexibility. Asking for sub-rows finds head sub-rows; catches and filters out by disunifying data with heads.

   15scrape_row(URL, Row) :-
   16    distinct(Row, scrape_row_(URL, Row)).
   17
   18scrape_row_(URL, Row) :-
   19    load_html(URL, DOM, []),
   20    xpath(DOM, //(table), Table),
   21    findall(Head, xpath(Table, //(thead)/tr/td(normalize_space), Head), Heads),
   22    xpath(Table, //(tr), TR),
   23    findall(Datum, xpath(TR, //(td(normalize_space)), Datum), Data),
   24    Data \== Heads,
   25    scrape_row__(Heads, Data, Columns),
   26    Row =.. [row|Columns].
   27
   28scrape_row__([], [], []).
   29scrape_row__([Head0|Heads], [Datum|Data], [Column|Columns]) :-
   30    restyle_identifier(one_two, Head0, Head),
   31    Column =.. [Head, Datum],
   32    scrape_row__(Heads, Data, Columns)