tokenize.pl

    1:- module(tokenize, [token_stream_of/2, detokenize/2]).    2/*
    3             A tokenizer
    4	     converts an input string into a list of tokens.
    5    Stuff that's a token:
    6    any C language symbol is the token word('foo73_bar')
    7    any sequence of digits that didn't get sucked into a C symbol is number(42)
    8    any other non whitespace is special('&')
    9*/
   10
   11% unifies if NewSoFar is SoFar with the token created from Word appended
   12% to it
   13% word is single character atoms of the word in reverse order
   14add_word(Word, SoFar, [word(Atom)|SoFar]) :-
   15	reverse(Word, WordLR),
   16	atomic_list_concat(WordLR, Atom).
   17
   18add_num(Word, SoFar, [number(Val)|SoFar]) :-
   19	base_ten_value(Word, 1 , 0, Val).
   20
   21base_ten_value([], _ , X, X).
   22base_ten_value([C|T], Place, ValSoFar, Val) :-
   23	char_type(C, digit(V)),
   24	NPlace is 10 * Place,
   25	NValSoFar is ValSoFar + Place * V,
   26	base_ten_value(T, NPlace, NValSoFar, Val).
   27
   28add_special(C, SoFar, [special(C)|SoFar]).
   29
   30% unifies if InToken is the token list of the input
   31% tokenize/5
   32%     -State, this is a state machine lexer, this is the state name
   33%     -Chars, remaining chars as a list of one char
   34%      atoms
   35%     -PartialToken, list of chars accumulated for this token in
   36%      backwards order
   37%     -TokensSoFar,  list of tokens accumulated so far
   38%     +InToken)       final output list of tokens
   39%
   40%     done
   41tokenize(_, [], [], Tokens, Tokens) :- !.
   42% done with part of a word
   43tokenize(in_word, [], Word, SoFar, Tokens) :-
   44	 add_word(Word, SoFar, Tokens).
   45tokenize(in_num, [], Word, SoFar, Tokens) :-
   46	add_num(Word, SoFar, Tokens).
   47
   48%all processing once we're in a word
   49tokenize(in_word, [C|T], Word, SoFar, Tokens) :-
   50	char_type(C, csym),
   51	tokenize(in_word, T, [C|Word], SoFar, Tokens).
   52tokenize(in_word, [C|T], Word, SoFar, Tokens) :-
   53	add_word(Word, SoFar, NewSoFar),
   54	tokenize(not_in_word, [C|T], [], NewSoFar, Tokens).
   55
   56%all processing once we're in a number
   57tokenize(in_num, [C|T], Word, SoFar, Tokens) :-
   58	char_type(C, digit),
   59	tokenize(in_num, T, [C|Word], SoFar, Tokens).
   60tokenize(in_num, [C|T], Word, SoFar, Tokens) :-
   61	add_num(Word, SoFar, NewSoFar),
   62	tokenize(not_in_word, [C|T], [], NewSoFar, Tokens).
   63
   64% encounter a csymf, start a word
   65tokenize(not_in_word, [C|T], [], SoFar, Tokens) :-
   66	char_type(C, csymf),
   67	tokenize(in_word, T, [C], SoFar, Tokens).
   68
   69% encounter a digit, start a number
   70tokenize(not_in_word, [C|T], [], SoFar, Tokens) :-
   71	char_type(C, digit),
   72	tokenize(in_num, T, [C], SoFar, Tokens).
   73
   74% eject whitespace
   75tokenize(not_in_word, [C|T], [], SoFar, Tokens) :-
   76	char_type(C, space),
   77	tokenize(not_in_word, T, [], SoFar, Tokens).
   78
   79% handle as a special
   80tokenize(not_in_word, [C|T], [], SoFar, Tokens) :-
   81	add_special(C, SoFar, NewSoFar),
   82	tokenize(not_in_word, T, [], NewSoFar, Tokens).
   83
   84token_stream_of('', []).
   85token_stream_of(Intext, InTokens) :-
   86	atom_chars(Intext, Chars),
   87	tokenize(not_in_word, Chars, [] , [] , InTokensBackwards),
   88	reverse(InTokensBackwards, InTokens).
   89
   90% -TokenList, +Atom   turn a list of tokens into an atom
   91detokenize([], '').
   92detokenize(TokenList, Atom) :- detokenize(TokenList, '', Atom).
   93detokenize(List, AtomSoFar, Atom) :-
   94	elements_to_atoms(List, [], ListOfAtoms),
   95	atom_concat(ListOfAtoms, ' ',  Atom).
   96
   97elements_to_atoms([] , A, A).
   98elements_to_atoms([word(X)|T], [X|SoFar], Final) :-
   99	atom(X),
  100	elements_to_atoms(T, SoFar, Final).
  101elements_to_atoms([nt(_)|T], SoFar, Final) :-
  102	elements_to_atoms(T, SoFar, Final).
  103elements_to_atoms([special(X)|T], [X|SoFar], Final) :-
  104	atom(X),
  105	elements_to_atoms(T, [X|SoFar], Final)