1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2016, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(r_grammar,
   36	  [ r_tokens//1,		% -Tokens
   37	    r_token//1,			% -Token
   38	    r_identifier/1		% +Name
   39	  ]).   40:- use_module(library(dcg/basics)).   41:- use_module(library(lists)).

R parser primitives

See also
- https://cran.r-project.org/doc/manuals/r-release/R-lang.html#Parser */
 r_tokens(-Tokens:list)// is nondet
Get R tokens in a non-greedy fashion. Backtracking extracts more tokens from the input. See r_token/1 for getting a single token.
   54r_tokens([]) --> [].
   55r_tokens([H|T]) -->
   56	r_token(H),
   57	r_tokens(T).
 r_identifier(+Atom) is semidet
True if Atom is a valid R identifier name.
   63r_identifier(Atom) :-
   64	atom(Atom),
   65	atom_codes(Atom, Codes),
   66	phrase(r_token(identifier(Atom)), Codes), !.
 r_token(-Token)// is semidet
Get an R token from the input. Defined tokens are:
number(FloatOrInt)
Used for numerical constants.
complex(FloatOrInt)
Used for <number>i.
string(String)
Used for quoted strings.
identifier(Atom)
General identifier (includes function names)
constant(Atom)
Reserved constants (e.g., NULL, NA, ...)
logical(Boolean)
Used for TRUE and FALSE
keyword(Atom)
Used for if, else, etc.
infix(Atom)
Used for %...% infix operators
op(Atom)
Used for +, -, *, etc.
punct(Atom)
Used for the braces {[()]} and the comma (,)
comment(String)
Used for # Comment lines.
   96r_token(Token) -->
   97	blanks,
   98	token(Token).
   99
  100token(Token) --> r_number(N),         !, {number_token(N, Token)}.
  101token(Token) --> r_string(S),         !, {Token = string(S)}.
  102token(Token) --> r_identifier(Id),    !, {identifier_token(Id, Token)}.
  103token(Token) --> r_infix(Id),         !, {Token = infix(Id)}.
  104token(Token) --> r_operator(Id),      !, {Token = op(Id)}.
  105token(Token) --> r_punct(Id),         !, {Token = punct(Id)}.
  106token(Token) --> r_comment(Id),       !, {Token = comment(Id)}.
  107
  108number_token(complex(I), complex(I)) :- !.
  109number_token(N, number(N)) :- !.
 r_number(-Number)// is semidet
  113r_number(Number) -->
  114	r_basic_number(N),
  115	(   "L"
  116	->  { integer(N) -> Number = N; Number is integer(N) /*warning*/ }
  117	;   "i"
  118	->  { Number = complex(N) }
  119	;   { Number = N }
  120	).
  121
  122r_basic_number(N) -->
  123	int_codes(I), !,
  124	(   dot,
  125	    digit(DF0),
  126	    digits(DF)
  127	->  {F = [0'., DF0|DF]}
  128	;   dot
  129	->  {F = `.0`}
  130	;   {F = []}
  131	),
  132	(   exp
  133	->  int_codes(DI),
  134	    {E=[0'e|DI]}
  135	;   {E = []}
  136	),
  137	{ append([I, F, E], Codes),
  138	  number_codes(N, Codes)
  139	}.
  140r_basic_number(N) -->
  141	dot, !,
  142	digit(DF0),
  143	digits(DF),
  144	{F = [0'., DF0|DF]},
  145	(   exp
  146	->  int_codes(DI),
  147	    {E=[0'e|DI]}
  148	;   {E = []}
  149	),
  150	{ append([`0`, F, E], Codes),
  151	  number_codes(N, Codes)
  152	}.
  153r_basic_number(N) --> "0x", !, xinteger(N).
  154r_basic_number(N) --> "0X", !, xinteger(N).
  155
  156
  157int_codes([C,D0|D]) -->
  158	sign(C), !,
  159	digit(D0),
  160	digits(D).
  161int_codes([D0|D]) -->
  162	digit(D0),
  163	digits(D).
  164
  165sign(0'-) --> "-".
  166sign(0'+) --> "+".
  167
  168dot --> ".".
  169
  170exp --> "e".
  171exp --> "E".
 r_string(-String)// is semidet
  175r_string(S) -->
  176	"\"",
  177	r_string_codes(C),
  178	"\"", !,
  179	{ string_codes(S, C) }.
  180r_string(S) -->
  181	"'",
  182	r_string_codes(C),
  183	"'", !,
  184	{ string_codes(S, C) }.
  185
  186
  187r_string_codes([]) --> [].
  188r_string_codes([H|T]) -->
  189	r_string_code(H),
  190	r_string_codes(T).
  191
  192r_string_code(H) --> "\\", !, r_escape(H).
  193r_string_code(H) --> [H].
  194
  195r_escape(H) --> [C], { r_escape(C, H) }, !.
  196r_escape(H) --> "x", !, xdigit(D1), xdigit(D2), {H is D1<<4 + D2}.
  197r_escape(H) --> "u", xdigits(4, H), !.
  198r_escape(H) --> "u{", xdigits(4, H), "}", !.
  199r_escape(H) --> "U", xdigits(8, H), !.
  200r_escape(H) --> "U{", xdigits(8, H), "}", !.
  201r_escape(H) --> digit(D1), {D1 =< 7}, !, odigits(2, D1, H).
  202
  203xdigits(N, V) --> xdigits(N, 0, V).
  204
  205xdigits(0, V, V) --> !.
  206xdigits(N, V0, V) -->
  207	xdigit(D),
  208	{ V1 is V0*16 + D,
  209	  N1 is N - 1
  210	},
  211	xdigits(N1, V1, V).
  212
  213odigits(0, V, V) --> !.
  214odigits(N, V0, V) -->
  215	digit(D), {D =< 7}, !,
  216	{ V1 is V0*8 + D,
  217	  N1 is N - 1
  218	},
  219	odigits(N1, V1, V).
  220odigits(_, V, V) --> [].
  221
  222r_escape(0'\', 0'\').
  223r_escape(0'\", 0'\").
  224r_escape(0'n, 0'\n).
  225r_escape(0'r, 0'\r).
  226r_escape(0't, 0'\t).
  227r_escape(0'b, 0'\b).
  228r_escape(0'a, 0'\a).
  229r_escape(0'f, 0'\f).
  230r_escape(0'v, 0'\v).
  231r_escape(0'\\, 0'\\).
 r_identifier(-Identifier)
Recognise an R identifier. This includes reserved terms and constants. These are filtered later.
  238r_identifier(Identifier) -->
  239	r_identifier_code(C0),
  240	\+ { no_identifier_start(C0) },
  241	r_identifier_codes(L),
  242	{ \+ ( C0 == 0'., L = [C1|_], code_type(C1, digit) ),
  243          atom_codes(Identifier, [C0|L])
  244	}.
  245
  246r_identifier_code(C) -->
  247	[C],
  248	{   code_type(C, alnum)
  249	->  true
  250	;   C == 0'_
  251	->  true
  252	;   C == 0'.
  253	}.
  254
  255r_identifier_codes([H|T]) -->
  256	r_identifier_code(H), !,
  257	r_identifier_codes(T).
  258r_identifier_codes([]) --> [].
  259
  260no_identifier_start(C) :- code_type(C, digit).
  261no_identifier_start(0'_).
 identifier_token(+Id, -Token) is det
  265identifier_token(Id, Token) :-
  266	(   reserved_identifier(Id, Token)
  267	->  true
  268	;   sub_atom(Id, 0, _, _, '..'),
  269	    sub_atom(Id, 2, _, 0, After),
  270	    (	After == '.'
  271	    ->	true
  272	    ;	atom_number(After, N),
  273		integer(N)
  274	    )
  275	->  Token = reserved(Id)
  276	;   Token = identifier(Id)
  277	).
  278
  279reserved_identifier('NULL',	     constant('NULL')).
  280reserved_identifier('NA',	     constant('NA')).
  281reserved_identifier('NA_integer_',   constant('NA_integer_')).
  282reserved_identifier('NA_real_',	     constant('NA_real_')).
  283reserved_identifier('NA_complex_',   constant('NA_complex_')).
  284reserved_identifier('NA_character_', constant('NA_character_')).
  285reserved_identifier('TRUE',	     logical(true)).
  286reserved_identifier('FALSE',	     logical(false)).
  287reserved_identifier(if,		     keyword(if)).
  288reserved_identifier(else,	     keyword(else)).
  289reserved_identifier(repeat,	     keyword(repeat)).
  290reserved_identifier(while,	     keyword(while)).
  291reserved_identifier(function,	     keyword(function)).
  292reserved_identifier(for,	     keyword(for)).
  293reserved_identifier(in,		     keyword(in)).
  294reserved_identifier(next,	     keyword(next)).
  295reserved_identifier(break,	     keyword(break)).
  296reserved_identifier('Inf',           number(Inf)) :- Inf is inf.
  297reserved_identifier('NaN',           number(NaN)) :- NaN is nan.
 r_infix(-Infix)//
  303r_infix(Id) -->
  304	"%", non_blanks_short(Chars), "%", !,
  305	{ append([`%`, Chars, `%`], All),
  306	  atom_codes(Id, All)
  307	}.
  308
  309non_blanks_short([]) --> [].
  310non_blanks_short([H|T]) --> nonblank(H), non_blanks_short(T).
 r_operator(-Op)//
  316r_operator(->)	 --> "->".
  317r_operator(+)	 --> "+".
  318r_operator(-)	 --> "-".
  319r_operator(*)	 --> "*".
  320r_operator(/)	 --> "/".
  321r_operator('%%') --> "%%".
  322r_operator(^)	 --> "^".
  323
  324r_operator(>=)	 --> ">=".
  325r_operator(>)	 --> ">".
  326r_operator(<=)	 --> "<=".
  327r_operator(<-)	 --> "<-".
  328r_operator(<)	 --> "<".
  329r_operator(==)	 --> "==".
  330r_operator('!=') --> "!=".
  331
  332r_operator(!)	 --> "!".
  333r_operator(&)	 --> "&".
  334r_operator('|')	 --> "|".
  335r_operator(~)	 --> "~".
  336r_operator($)	 --> "$".
  337r_operator(:)	 --> ":".
  338r_operator(=)	 --> "=".
  339
  340r_punct('(') --> "(".
  341r_punct(')') --> ")".
  342r_punct('{') --> "{".
  343r_punct('}') --> "}".
  344r_punct('[') --> "[".
  345r_punct(']') --> "]".
  346r_punct(',') --> ",".
  347
  348r_comment(String) -->
  349	"#", string(Codes), eol, !,
  350	{ string_codes(String, Codes) }.
  351
  352:- if(\+ current_predicate(eol//0)).  353% part of library(dcg/basics) from 8.3.26.
  354eol --> "\r".
  355eol --> "\n".
  356eol --> eos.
  357:- endif.