1:- encoding(utf8).
    2:- module(
    3  rdf_guess,
    4  [
    5    rdf_guess_file/3,   % +FileSpec, +Size, -MediaType
    6    rdf_guess_stream/3, % +In, +Size, -MediaType
    7    rdf_guess_string/2  % +String, -MediaType
    8  ]
    9).

RDF guess

This module heuristically guesses the RDF serialization format that is stored in the input stream In.

RDF/XML can be distinguished from Turtle-family serializations, because it is not possible to define valid RDF/XML without declaring XML namespaces.

*/

   22:- use_module(library(sgml)).   23:- use_module(library(yall)).   24
   25:- use_module(library(archive_ext)).   26:- use_module(library(dcg)).   27:- use_module(library(file_ext)).   28:- use_module(library(media_type)).   29:- use_module(library(rdf_prefix)).   30
   31:- meta_predicate
   32    n3_lexical_form_codes(//, ?, ?).
 rdf_guess_file(+FileSpec:term, +Size:positiveInteger, -MediaType:media_type) is semidet
   42rdf_guess_file(Spec, Size, MediaType) :-
   43  read_from_file(
   44    Spec,
   45    {Size,MediaType}/[In0]>>rdf_guess_stream(In0, Size, MediaType)
   46  ).
 rdf_guess_stream(+In:istream, +Size:nonneg, -MediaType:media_type) is semidet
Arguments:
Size- is the number of codes that is read from the input stream In, on which the guess is based. This number is doubled while backtracking, until either the end of the stream is reached or the maximum peek size, as indicated by the input stream `In', is exceeded.
MediaType- is a compound term of the form `media(Supertype/Subtype,Params)'. This is how Media Types are represented in the HTTP package (see http_parse_header_value/3).

There is one JSON-family Media Type:

  • media(application/ld+json,[]) for JSON-LD

There are two/four Turtle-family Media Types:

  • media(application/trig,[]) for TriG This includes the Media Type for Turtle, i.e., `media(text/turtle,[])'.
  • media(application/'n-nquads',[]) for N-Quads This includes the Media Type for N-Triples, i.e., `media(application/'n-triples',[])'.

There are two SGML-family Media Types that denote RDF:

  • media(application/'rdf+xml',[]) for RDF/XML
  • media(application/'xhtml+xml',[]) for RDFa
  • media(text/html,_) for RDFa
   87rdf_guess_stream(In, Size, MediaType) :-
   88  peek_string(In, Size, String),
   89  rdf_guess_string(String, MediaType).
 rdf_guess_string(+String:string, -MediaType:media_type) is semidet
   95rdf_guess_string(String, MediaType) :-
   96  rdf_guess_string_(String, Ext),
   97  media_type_extension(MediaType, Ext).
   98
   99rdf_guess_string_(String, jsonld) :-
  100  string_phrase(jsonld_format, String), !.
  101rdf_guess_string_(String, Ext) :-
  102  % We use the information as to whether or not the end of the stream
  103  % has been reached.
  104  string_phrase(n3_format(Ext), String, _), !.
  105rdf_guess_string_(String, Ext) :-
  106  setup_call_cleanup(
  107    open_string(String, In),
  108    sgml_format(In, Ext),
  109    close(In)
  110  ).
  111
  112
  113
  114% JSON-LD %
 jsonld_format//
It is not clear to me whether a JSON document is a single array, a single object, or a sequence of arrays and objects. The JSON grammars that I could find online only define JSON terms, not JSON documents.

Whatever the case may be, if a document start with the square opening bracket, it can be neither belong to the Turtle- nor to the SGML-family, so it must be JSON-LD.

If a document start with a curly opening bracket, it can still be TriG or JSON-LD. However, if the opening curly bracket is directly followed by a JSON-conformant string and a colon (the key/value separator in JSON).

  132jsonld_format -->
  133  blanks,
  134  optional_arrays,
  135  "{",
  136  blanks,
  137  json_string,
  138  blanks,
  139  ":", !,
  140  remainder(_).
  141
  142optional_arrays -->
  143  "[", !,
  144  blanks,
  145  optional_arrays.
  146optional_arrays --> "".
  147
  148% JSON strings start with a double quote.
  149json_string -->
  150  "\"",
  151  json_string_rest.
  152
  153% JSON strings may contain escaped double quotes.
  154json_string_rest -->
  155  "\\\"", !,
  156  json_string_rest.
  157% An unescaped double quote ends a JSON string.
  158json_string_rest -->
  159  "\"", !.
  160% Skip other JSON string content.
  161json_string_rest -->
  162  [_], !,
  163  json_string_rest.
  164
  165
  166
  167% N3 FAMILY %
 n3_format(-Extension:oneof([nq,trig]))// is semidet
Succeeds on a list of codes that match the beginning of a document in the Turtle-family.
  174% skip blanks
  175n3_format(Ext) -->
  176  n3_blank, !,
  177  n3_blanks,
  178  n3_format(Ext).
  179% N-Quads, N-Triples, TriG, Turtle comment
  180n3_format(Ext) -->
  181  n3_comment, !,
  182  n3_format(Ext).
  183% Turtle, TriG base or prefix declaration
  184n3_format(trig) -->
  185  ("@base" ; "base" ; "@prefix" ; "prefix"), !.
  186% TriG default graph
  187n3_format(trig) -->
  188  "{", !.
  189% N-Quads, N-Triples TriG, Turtle triple or quadruple
  190n3_format(Ext) -->
  191  n3_subject(Ext),
  192  n3_blanks,
  193  n3_predicate(Ext),
  194  n3_blanks,
  195  n3_object(Ext),
  196  n3_blanks,
  197  (   % end of a triple
  198      "."
  199  ->  ""
  200  ;   % TriG, Turtle object list notation
  201      ";"
  202  ->  {Ext = trig}
  203  ;   % TriG, Turtle predicate-object pairs list notation
  204      ","
  205  ->  {Ext = trig}
  206  ;   % N-Quads end of a quadruple
  207      n3_graph(Ext),
  208      n3_blanks,
  209      "."
  210  ->  {Ext = nq}
  211  ), !,
  212  {(var(Ext) -> Ext = nq ; true)}.
  213% TriG, Turtle anonymous blank node
  214n3_format(trig) -->
  215  "[", !.
  216% TriG, Turtle collection
  217n3_format(trig) -->
  218  "(", !.
  219% TriG named graph
  220n3_format(trig) -->
  221  n3_graph(_),
  222  n3_blanks,
  223  "{", !.
  224
  225% N3 only allows horizontal tab and space, but we skip other blank
  226% characters as well, since they may appear in non-conforming
  227% documents without telling us anything about which N3 subtype we are
  228% parsing.
  229n3_blank -->
  230  blank.
  231n3_blank -->
  232  n3_comment.
  233
  234n3_blanks -->
  235  n3_blank, !,
  236  n3_blanks.
  237n3_blanks --> "".
  238
  239n3_bnode -->
  240  "_:",
  241  nonblanks.
  242
  243n3_comment -->
  244  "#",
  245  string(_),
  246  (eol ; eos).
  247
  248eol --> "\n".
  249eol --> "\r\n".
  250
  251n3_graph(Ext) -->
  252  n3_iriref(Ext).
  253
  254% N-Quads, N-Triples, TriG, Turtle full IRI
  255n3_iriref(_) -->
  256  "<", !,
  257  n3_iriref0.
  258% TriG, Turtle prefixed IRI
  259n3_iriref(trig) -->
  260  n3_iriref_prefix,
  261  ":",
  262  nonblanks.
  263
  264n3_iriref0 -->
  265  ">", !.
  266n3_iriref0 -->
  267  "\\", !,
  268  (   "u"
  269  ->  xdigit(_), xdigit(_), xdigit(_), xdigit(_)
  270  ;   "U"
  271  ->  xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_)
  272  ),
  273  n3_iriref0.
  274n3_iriref0 -->
  275  [Code],
  276  {\+ non_iri_code(Code)},
  277  n3_iriref0.
  278
  279non_iri_code(Code) :-
  280  between(0x0, 0x20, Code).
  281non_iri_code(0'<).
  282non_iri_code(0'>).
  283non_iri_code(0'").%"
  284non_iri_code(0'{).
  285non_iri_code(0'}).
  286non_iri_code(0'|).
  287non_iri_code(0'^).
  288non_iri_code(0'`).
  289non_iri_code(0'\\).
  290
  291n3_iriref_prefix -->
  292  ":", !,
  293  {fail}.
  294n3_iriref_prefix -->
  295  blank, !,
  296  {fail}.
  297n3_iriref_prefix -->
  298  nonblank, !,
  299  n3_iriref_prefix.
  300n3_iriref_prefix --> "".
  301
  302% TriG, Turtle lexical form with triple single quotes
  303n3_lexical_form(trig) -->
  304  "'''", !,
  305  n3_lexical_form_codes([0'',0'',0'']).
  306% TriG, Turtle lexical form with single single quotes
  307n3_lexical_form(trig) -->
  308  "'", !,
  309  n3_lexical_form_codes([0'']).
  310% TriG, Turtle lexical form with triple double quotes
  311n3_lexical_form(trig) -->
  312  "\"\"\"", !,
  313  n3_lexical_form_codes([0'",0'",0'"]).
  314% N-Quads, N-Triples, TriG, Turtle lexical form with single double
  315% quotes
  316n3_lexical_form(_) -->
  317  "\"",
  318  n3_lexical_form_codes([0'"]).
  319
  320% Escaped single quote.
  321n3_lexical_form_codes(End) -->
  322  "\\\'", !,
  323  n3_lexical_form_codes(End).
  324% Escaped double quote.
  325n3_lexical_form_codes(End) -->
  326  "\\\"", !,
  327  n3_lexical_form_codes(End).
  328% End of string.
  329n3_lexical_form_codes(End) -->
  330  End, !.
  331% Content.
  332n3_lexical_form_codes(End) -->
  333  [_], !,
  334  n3_lexical_form_codes(End).
  335% End of stream.
  336n3_lexical_form_codes(_) --> "".
  337
  338% TriG, Turtle abbreviated form for XSD boolean, decimal, double, and
  339% integer literals
  340n3_literal(trig) -->
  341  ("false" ; "true" ; "+" ; "-" ; "." ; digit(_)), !.
  342n3_literal(Ext) -->
  343  n3_lexical_form(Ext),
  344  n3_blanks,
  345  (   % Literal with an explicit datatype IRI (previously: typed
  346      % literal).
  347      "^^"
  348  ->  n3_blanks,
  349      n3_iriref(Ext)
  350  ;   % Language-tagged string (datatype IRI `rdf:langString').
  351      "@"
  352  ->  n3_ltag
  353  ;   % Abbreviated notation for `xsd:string' (previously: simple
  354      % literal).
  355      ""
  356  ).
  357
  358n3_ltag -->
  359  nonblanks.
  360
  361n3_object(Ext) -->
  362  n3_iriref(Ext), !.
  363n3_object(_) -->
  364  n3_bnode, !.
  365n3_object(Ext) -->
  366  n3_literal(Ext).
  367
  368n3_predicate(Ext) -->
  369  n3_iriref(Ext), !.
  370% TriG, Turtle abbreviation for `rdf:type'
  371n3_predicate(trig) -->
  372  "a".
  373
  374n3_subject(Ext) -->
  375  n3_iriref(Ext), !.
  376n3_subject(_) -->
  377  n3_bnode.
  378
  379
  380
  381% SGML FAMILY %
 sgml_format(+In:istream, -Extension:atom) is semidet
Try to see whether the document is some form of HTML or XML and in particular whether it is RDF/XML. The latter is basically impossible because it is not obligatory for an RDF/XML document to have an rdf:RDF top level element, and when using a typed node, just about anything can qualify for RDF. The only real demand is that the XML document must use XML namespaces, because these are both required to define `rdf:Description' and a valid type IRI from a typed node.

If the toplevel element is detected as HTML we guess that the document contains RDFa.

  397sgml_format(In, Ext) :-
  398  sgml_doctype(In, Dialect, DocType, Attributes),
  399  doc_content_type(Dialect, DocType, Attributes, Ext).
 sgml_doctype(+In:istream, -Dialect:atom, -Doctype:atom, -Attributes:list(compound)) is semidet
Parse a repositional stream and get the name of the first SGML element and demand that this element defines XML namespaces. Fails if the document is illegal SGML before the first element.

Note that it is not possible to define valid RDF/XML without namespaces, while it is not possible to define a valid absolute Turtle IRI (using <...>-notation) with a valid xmlns declaration.

  415sgml_doctype(In, Dialect, DocType, Attributes) :-
  416  setup_call_cleanup(
  417    make_parser(In, Parser, State),
  418    catch(
  419      sgml_parse(
  420        Parser,
  421        [
  422          call(begin, on_begin),
  423          call(cdata, on_cdata),
  424          call(decl, on_declaration),
  425          max_errors(-1),
  426          source(In),
  427          syntax_errors(quiet)
  428        ]
  429      ),
  430      E,
  431      true
  432    ),
  433    clean_parser(In, Parser, State)
  434  ),
  435  nonvar(E),
  436  E = tag(Dialect, DocType, Attributes).
  437
  438make_parser(In, Parser, state(Position)):-
  439  stream_property(In, position(Position)),
  440  new_sgml_parser(Parser, []).
  441
  442clean_parser(In, Parser, state(Position)):-
  443  free_sgml_parser(Parser),
  444  set_stream_position(In, Position).
  445
  446on_begin(Tag, Attributes, Parser) :-
  447  get_sgml_parser(Parser, dialect(Dialect)),
  448  throw(tag(Dialect, Tag, Attributes)).
  449
  450on_cdata(_, _) :-
  451  throw(error(cdata)).
  452
  453on_declaration(Text, Parser) :-
  454  atomic_list_concat(Components, ' ', Text),
  455  Components = ['DOCTYPE',Doctype],
  456  get_sgml_parser(Parser, dialect(Dialect)),
  457  throw(tag(Dialect, Doctype, [])).
 doc_content_type(+Dialect:atom, +Doctype:atom, +Attributes:list(compound), -Extension:atom) is det
  463doc_content_type(_, html, _, html) :- !.
  464doc_content_type(html, _, _, html) :- !.
  465doc_content_type(xhtml, _, _, xhtml) :- !.
  466doc_content_type(html5, _, _, html) :- !.
  467doc_content_type(xhtml5, _, _, xhtml) :- !.
  468doc_content_type(xml, rss, _, rdf) :- !.
  469doc_content_type(Dialect, Top,  Attributes, rdf) :-
  470  % Extract the namespace from the doctype.
  471  dialect_local_name(Dialect, LocalName),
  472  atomic_list_concat([NS,LocalName], :, Top),
  473
  474  % Look up the RDF namespace in the attributes list.
  475  atomic_list_concat([xmlns,NS], :, Attribute),
  476  memberchk(Attribute=RDFNS, Attributes),
  477
  478  % Ensure it is indeed the RDF namespace.
  479  rdf_prefix(rdf, RDFNS).
  480
  481dialect_local_name(sgml, rdf).
  482dialect_local_name(xml, 'RDF')