/*  Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (c)  2012-2023, University of Amsterdam
                              VU University Amsterdam
                              SWI-Prolog Solutions b.v.
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

:- module(dcg_basics,
          [ white//0,                   % <white inside line>
            whites//0,                  % <white inside line>*
            blank//0,                   % <blank>
            blanks//0,                  % <blank>*
            nonblank//1,                % <nonblank>
            nonblanks//1,               % <nonblank>* --> chars         (long)
            blanks_to_nl//0,            % [space,tab,ret]*nl
            string//1,                  % <any>* -->chars               (short)
            string_without//2,          % Exclude, -->chars             (long)
                                        % Characters
            alpha_to_lower//1,          % Get lower|upper, return lower
                                        % Decimal numbers
            digits//1,                  % [0-9]* -->chars
            digit//1,                   % [0-9] --> char
            integer//1,                 % [+-][0-9]+ --> integer
            float//1,                   % [+-]?[0-9]+(.[0-9]*)?(e[+-]?[0-9]+)? --> float
            number//1,                  % integer | float
                                        % Hexadecimal numbers
            xdigits//1,                 % [0-9A-Fa-f]* --> 0-15*
            xdigit//1,                  % [0-9A-Fa-f] --> 0-15
            xinteger//1,                % [0-9A-Fa-f]+ --> integer

            prolog_var_name//1,         % Read a Prolog variable name
            csym//1,                    % Read a C symbol

            eol//0,                     % End of line
            eos//0,                     % Test end of input.
            remainder//1,               % -List

                                        % generation (TBD)
            atom//1                     % generate atom
          ]).
:- use_module(library(lists)).
:- use_module(library(error)).


/** <module> Various general DCG utilities

This library provides various commonly  used   DCG  primitives acting on
list  of  character  *codes*.  Character   classification  is  based  on
code_type/2.

This module started its life as  library(http/dcg_basics) to support the
HTTP protocol. Since then, it was increasingly  used in code that has no
relation to HTTP and therefore  this  library   was  moved  to  the core
library.

@tbd    This is just a starting point. We need a comprehensive set of
        generally useful DCG primitives.
*/

%!  string_without(+EndCodes, -Codes)// is det.
%
%   Take as many codes from the input  until the next character code
%   appears in the list EndCodes.  The   terminating  code itself is
%   left on the input.  Typical  use  is   to  read  upto  a defined
%   delimiter such as a newline  or   other  reserved character. For
%   example:
%
%       ==
%           ...,
%           string_without("\n", RestOfLine)
%       ==
%
%   @arg EndCodes is a list of character codes.
%   @see string//1.

string_without(End, Codes) -->
    { string(End),
      !,
      string_codes(End, EndCodes)
    },
    list_string_without(EndCodes, Codes).
string_without(End, Codes) -->
    list_string_without(End, Codes).

list_string_without(Not, [C|T]) -->
    [C],
    { \+ memberchk(C, Not)
    },
    !,
    list_string_without(Not, T).
list_string_without(_, []) -->
    [].

%!  string(-Codes)// is nondet.
%
%   Take as few as possible tokens from the input, taking one more
%   each time on backtracking. This code is normally followed by a
%   test for a delimiter.  For example:
%
%   ==
%   upto_colon(Atom) -->
%           string(Codes), ":", !,
%           { atom_codes(Atom, Codes) }.
%   ==
%
%   @see string_without//2.

string([]) -->
    [].
string([H|T]) -->
    [H],
    string(T).

%!  blanks// is det.
%
%   Skip zero or more white-space characters.

blanks -->
    blank,
    !,
    blanks.
blanks -->
    [].

%!  blank// is semidet.
%
%   Take next =space= character from input. Space characters include
%   newline.
%
%   @see white//0

blank -->
    [C],
    { nonvar(C),
      code_type(C, space)
    }.

%!  nonblanks(-Codes)// is det.
%
%   Take all =graph= characters

nonblanks([H|T]) -->
    [H],
    { code_type(H, graph)
    },
    !,
    nonblanks(T).
nonblanks([]) -->
    [].

%!  nonblank(-Code)// is semidet.
%
%   Code is the next non-blank (=graph=) character.

nonblank(H) -->
    [H],
    { code_type(H, graph)
    }.

%!  blanks_to_nl// is semidet.
%
%   Take a sequence of blank//0 codes if blanks are followed by a
%   newline or end of the input.

blanks_to_nl -->
    "\n",
    !.
blanks_to_nl -->
    blank,
    !,
    blanks_to_nl.
blanks_to_nl -->
    eos.

%!  whites// is det.
%
%   Skip white space _inside_ a line.
%
%   @see blanks//0 also skips newlines.

whites -->
    white,
    !,
    whites.
whites -->
    [].

%!  white// is semidet.
%
%   Take next =white= character from input. White characters do
%   _not_ include newline.

white -->
    [C],
    { nonvar(C),
      code_type(C, white)
    }.


                 /*******************************
                 *       CHARACTER STUFF        *
                 *******************************/

%!  alpha_to_lower(?C)// is semidet.
%
%   Read a letter (class  =alpha=)  and   return  it  as a lowercase
%   letter. If C is instantiated and the  DCG list is already bound,
%   C must be =lower= and matches both a lower and uppercase letter.
%   If the output list is unbound, its first element is bound to C.
%   For example:
%
%     ==
%     ?- alpha_to_lower(0'a, `AB`, R).
%     R = [66].
%     ?- alpha_to_lower(C, `AB`, R).
%     C = 97, R = [66].
%     ?- alpha_to_lower(0'a, L, R).
%     L = [97|R].
%     ==

alpha_to_lower(L) -->
    [C],
    {   nonvar(C)
    ->  code_type(C, alpha),
        code_type(C, to_upper(L))
    ;   L = C
    }.


                 /*******************************
                 *            NUMBERS           *
                 *******************************/

%!  digits(?Chars)// is det.
%!  digit(?Char)// is det.
%!  integer(?Integer)// is det.
%
%   Number processing. The predicate  digits//1   matches a possibly
%   empty set of digits,  digit//1  processes   a  single  digit and
%   integer processes an  optional  sign   followed  by  a non-empty
%   sequence of digits into an integer.

digits([H|T]) -->
    digit(H),
    !,
    digits(T).
digits([]) -->
    [].

digit(C) -->
    [C],
    { code_type(C, digit)
    }.

integer(I, Head, Tail) :-
    nonvar(I),
    !,
    format(codes(Head, Tail), '~d', [I]).
integer(I) -->
    int_codes(Codes),
    { number_codes(I, Codes)
    }.

int_codes([C,D0|D]) -->
    sign(C),
    !,
    digit(D0),
    digits(D).
int_codes([D0|D]) -->
    digit(D0),
    digits(D).


%!  float(?Float)// is det.
%
%   Process a floating  point  number.   The  actual  conversion  is
%   controlled by number_codes/2.

float(F, Head, Tail) :-
    float(F),
    !,
    with_output_to(codes(Head, Tail), write(F)).
float(F) -->
    number(F),
    { float(F) }.

%!  number(+Number)// is det.
%!  number(-Number)// is semidet.
%
%   Generate extract a number. Handles   both  integers and floating
%   point numbers.

number(N, Head, Tail) :-
    number(N),
    !,
    format(codes(Head, Tail), '~w', N).
number(N) -->
    { var(N)
    },
    !,
    int_codes(I),
    (   dot,
        digit(DF0),
        digits(DF)
    ->  {F = [0'., DF0|DF]}
    ;   {F = []}
    ),
    (   exp
    ->  int_codes(DI),
        {E=[0'e|DI]}
    ;   {E = []}
    ),
    { append([I, F, E], Codes),
      number_codes(N, Codes)
    }.
number(N) -->
    { type_error(number, N) }.

sign(0'-) --> "-".
sign(0'+) --> "+".

dot --> ".".

exp --> "e".
exp --> "E".

                 /*******************************
                 *          HEX NUMBERS         *
                 *******************************/

%!  xinteger(+Integer)// is det.
%!  xinteger(-Integer)// is semidet.
%
%   Generate or extract an integer from   a  sequence of hexadecimal
%   digits. Hexadecimal characters include both  uppercase (A-F) and
%   lowercase (a-f) letters. The value may   be  preceded by  a sign
%   (+/-)

xinteger(Val, Head, Tail) :-
    integer(Val),
    !,
    format(codes(Head, Tail), '~16r', [Val]).
xinteger(Val) -->
    sign(C),
    !,
    xdigit(D0),
    xdigits(D),
    { mkval([D0|D], 16, Val0),
      (   C == 0'-
      ->  Val is -Val0
      ;   Val = Val0
      )
    }.
xinteger(Val) -->
    xdigit(D0),
    xdigits(D),
    { mkval([D0|D], 16, Val)
    }.

%!  xdigit(-Weight)// is semidet.
%
%   True if the next code is a  hexdecimal digit with Weight. Weight
%   is  between  0  and  15.  Hexadecimal  characters  include  both
%   uppercase (A-F) and lowercase (a-f) letters.

xdigit(D) -->
    [C],
    { code_type(C, xdigit(D))
    }.

%!  xdigits(-WeightList)// is det.
%
%   List of weights of a sequence   of hexadecimal codes. WeightList
%   may be empty. Hexadecimal  characters   include  both  uppercase
%   (A-F) and lowercase (a-f) letters.

xdigits([D0|D]) -->
    xdigit(D0),
    !,
    xdigits(D).
xdigits([]) -->
    [].

mkval([W0|Weights], Base, Val) :-
    mkval(Weights, Base, W0, Val).

mkval([], _, W, W).
mkval([H|T], Base, W0, W) :-
    W1 is W0*Base+H,
    mkval(T, Base, W1, W).


                 /*******************************
                 *         END-OF-STRING        *
                 *******************************/

%!  eol//
%
%   Matches end-of-line. Matching \r\n, \n or end of input (eos//0).

eol --> "\n", !.
eol --> "\r\n", !.
eol --> eos.

%!  eos//
%
%   Matches  end-of-input.  The  implementation    behaves   as  the
%   following portable implementation:
%
%     ==
%     eos --> call(eos_).
%     eos_([], []).
%     ==
%
%   @tbd    This is a difficult concept and violates the _context free_
%           property of DCGs.  Explain the exact problems.

eos([], []).

%!  remainder(-List)//
%
%   Unify List with the remainder of the input.

remainder(List, List, []).


                 /*******************************
                 *         PROLOG SYNTAX                *
                 *******************************/

%!  prolog_var_name(-Name:atom)// is semidet.
%
%   Matches a Prolog variable name. Primarily  intended to deal with
%   quasi quotations that embed Prolog variables.

prolog_var_name(Name) -->
    [C0], { code_type(C0, prolog_var_start) },
    !,
    prolog_id_cont(CL),
    { atom_codes(Name, [C0|CL]) }.

prolog_id_cont([H|T]) -->
    [H], { code_type(H, prolog_identifier_continue) },
    !,
    prolog_id_cont(T).
prolog_id_cont([]) --> "".


                 /*******************************
                 *          IDENTIFIERS         *
                 *******************************/

%!  csym(?Symbol:atom)// is semidet.
%
%   Recognise a C symbol according to   the  `csymf` and `csym` code
%   type classification provided by the C library.

csym(Name, Head, Tail) :-
    nonvar(Name),
    format(codes(Head, Tail), '~w', [Name]).
csym(Name) -->
    [F], {code_type(F, csymf)},
    csyms(Rest),
    { atom_codes(Name, [F|Rest]) }.

csyms([H|T]) -->
    [H], {code_type(H, csym)},
    !,
    csyms(T).
csyms([]) -->
    "".


                 /*******************************
                 *           GENERATION         *
                 *******************************/

%!  atom(++Atom)// is det.
%
%   Generate codes of Atom.  Current implementation uses write/1,
%   dealing with any Prolog term.  Atom must be ground though.

atom(Atom, Head, Tail) :-
    must_be(ground, Atom),
    format(codes(Head, Tail), '~w', [Atom]).