GCC Code Coverage Report
Directory: ../src/ Exec Total Coverage
File: /home/joels/Current/lispbm/src/tokpar.c Lines: 245 270 90.7 %
Date: 2025-04-09 11:39:30 Branches: 205 283 72.4 %

Line Branch Exec Source
1
/*
2
    Copyright 2019, 2021, 2022 Joel Svensson  svenssonjoel@yahoo.se
3
4
    This program is free software: you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation, either version 3 of the License, or
7
    (at your option) any later version.
8
9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13
14
    You should have received a copy of the GNU General Public License
15
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
*/
17
18
#include <stdbool.h>
19
#include <ctype.h>
20
#include <string.h>
21
#include <stdlib.h>
22
23
#include "lbm_memory.h"
24
#include "lbm_types.h"
25
#include "lbm_channel.h"
26
#include "tokpar.h"
27
#include "symrepr.h"
28
#include "heap.h"
29
#include "env.h"
30
31
// Scratch buffer that tok_symbol and tok_string write the text of the
// most recently scanned symbol or string into.
// +1 to ensure there is always a zero at last ix
32
char tokpar_sym_str[TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH+1];
33
34
/* One row of a fixed-spelling token table. */
typedef struct {
  const char *str;    /* exact text of the token */
  uint16_t    token;  /* identifier reported when the text matches */
  uint16_t    len;    /* number of characters of str to compare */
} matcher;
39
40
/*
41
  \#\a -> 7                 ; control-g
42
  \#\b -> 8                 ; backspace, BS
43
  \#\t -> 9                 ; tab, TAB
44
  \#\n -> 10                ; newline
45
  \#\v -> 11                ; vertical tab
46
  \#\f -> 12                ; formfeed character
47
  \#\r -> 13                ; carriage return, RET
48
  \#\e -> 27                ; escape character, ESC
49
  \#\s -> 32                ; space character, SPC
50
  \#\\ -> 92                ; backslash character, \
51
  \#\d -> 127               ; delete character, DEL
52
*/
53
54
/* Each row is { letter following the backslash, character it denotes }. */
#define NUM_SPECIAL_CHARS 11
const char special_chars[NUM_SPECIAL_CHARS][2] = {
  {'a',  '\a'},   /* control-g       */
  {'b',  '\b'},   /* backspace       */
  {'t',  '\t'},   /* tab             */
  {'n',  '\n'},   /* newline         */
  {'v',  '\v'},   /* vertical tab    */
  {'f',  '\f'},   /* formfeed        */
  {'r',  '\r'},   /* carriage return */
  {'e',  27},     /* escape          */
  {'s',  32},     /* space           */
  {'\\', '\\'},   /* backslash       */
  {'d',  127}     /* delete          */
};
67
68
#define NUM_FIXED_SIZE_TOKENS 18
69
const matcher fixed_size_tokens[NUM_FIXED_SIZE_TOKENS] = {
70
  {"(", TOKOPENPAR, 1},
71
  {")", TOKCLOSEPAR, 1},
72
  {"[|", TOKOPENARRAY, 2},
73
  {"[", TOKOPENBRACK, 1},
74
  {"]", TOKCLOSEBRACK, 1},
75
  {".", TOKDOT, 1},
76
  {"_", TOKDONTCARE, 1},
77
  {"'", TOKQUOTE, 1},
78
  {"`", TOKBACKQUOTE, 1},
79
  {",@", TOKCOMMAAT, 2},
80
  {",", TOKCOMMA, 1},
81
  {"?", TOKMATCHANY, 1},
82
  {"{", TOKOPENCURL, 1},
83
  {"}", TOKCLOSECURL, 1},
84
  {"|]", TOKCLOSEARRAY, 2},
85
  {"@const-start", TOKCONSTSTART, 12},
86
  {"@const-end", TOKCONSTEND, 10},
87
};
88
89
#define NUM_TYPE_QUALIFIERS 9
90
const matcher type_qual_table[NUM_TYPE_QUALIFIERS] = {
91
  {"f64", TOKTYPEF64,  3},
92
  {"f32", TOKTYPEF32,  3},
93
  {"i64", TOKTYPEI64,  3},
94
  {"u64", TOKTYPEU64,  3},
95
  {"i32", TOKTYPEI32,  3},
96
  {"u32", TOKTYPEU32,  3},
97
  {"i"  , TOKTYPEI,    1},
98
  {"u"  , TOKTYPEU,    1},
99
  {"b"  , TOKTYPEBYTE, 1}
100
};
101
102
9144420
/*
  Compare the channel contents starting at start_pos against each of the
  num entries of the table m, in table order.

  Returns the length of the first entry that matches completely and stores
  its token id in *res. Returns TOKENIZER_NEED_MORE as soon as a peek
  reports CHANNEL_MORE, and TOKENIZER_NO_TOKEN when no entry matches
  (*res is left untouched in both of these cases).
*/
static int tok_match_fixed_size_tokens(lbm_char_channel_t *chan, const matcher *m, unsigned int start_pos, unsigned int num, uint32_t *res) {

  for (unsigned int i = 0; i < num; i ++) {
    const unsigned int tok_len = m[i].len;
    const char *match_str = m[i].str;

    // An entry without text (for example zero-filled padding at the end
    // of a table) is not a token. Without this check it would "match"
    // any input with length 0.
    if (tok_len == 0 || match_str == NULL) continue;

    unsigned int matched = 0;
    while (matched < tok_len) {
      char c;
      int r = lbm_channel_peek(chan, start_pos + matched, &c);
      if (r == CHANNEL_SUCCESS) {
        if (c != match_str[matched]) break;
      } else if (r == CHANNEL_MORE) {
        return TOKENIZER_NEED_MORE;
      } else {
        break; // end of channel, this entry cannot match
      }
      matched ++;
    }

    if (matched == tok_len) { // match
      *res = m[i].token;
      return (int)tok_len;
    }
  }
  return TOKENIZER_NO_TOKEN;
}
127
128
5766276
/*
  Match the front of the channel against the fixed_size_tokens table.
  The result is whatever tok_match_fixed_size_tokens returns for that
  table; on a match the token id is written to *res.
*/
int tok_syntax(lbm_char_channel_t *chan, uint32_t *res) {
  const unsigned int first_pos = 0; // tokens start at the front of the channel
  return tok_match_fixed_size_tokens(chan,
                                     fixed_size_tokens,
                                     first_pos,
                                     NUM_FIXED_SIZE_TOKENS,
                                     res);
}
131
132
3626428
/* True for the letters 'a'..'z' and 'A'..'Z'. */
static bool alpha_char(char c) {
  if (c >= 'a' && c <= 'z') return true;
  if (c >= 'A' && c <= 'Z') return true;
  return false;
}
136
137
1112155
/* True for the decimal digits '0'..'9'. */
static bool num_char(char c) {
  return !(c < '0' || c > '9');
}
140
141
924733
/*
  True if c may be the first character of a symbol: a letter or one of
  + - * / = < > # !
*/
static bool symchar0(char c) {
  if (alpha_char(c)) return true;

  for (const char *p = "+-*/=<>#!"; *p != 0; p++) {
    if (*p == c) return true;
  }
  return false;
}
152
153
2701695
/*
  True if c may appear after the first character of a symbol: a letter,
  a digit or one of + - * / = < > ! ? _
*/
static bool symchar(char c) {
  if (alpha_char(c)) return true;
  if (num_char(c)) return true;

  for (const char *p = "+-*/=<>!?_"; *p != 0; p++) {
    if (*p == c) return true;
  }
  return false;
}
164
165
924733
/*
  Try to read a symbol from the front of the channel.

  On success the lower-cased, zero terminated symbol text is left in
  tokpar_sym_str and its length is returned.

  Returns TOKENIZER_NEED_MORE when a peek reports CHANNEL_MORE,
  TOKENIZER_NO_TOKEN when the channel is at its end or the first character
  cannot start a symbol, and TOKENIZER_SYMBOL_ERROR when the symbol is
  longer than allowed.
*/
int tok_symbol(lbm_char_channel_t *chan) {

  // Longest symbol accepted, independent of the buffer size.
  const int symbol_length_limit = 255;

  char c;
  int r = lbm_channel_peek(chan, 0, &c);
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;
  if (r == CHANNEL_SUCCESS && !symchar0(c)) {
    return TOKENIZER_NO_TOKEN;
  }
  memset(tokpar_sym_str,0,TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH+1);
  tokpar_sym_str[0] = (char)tolower((unsigned char)c);

  int len = 1;

  r = lbm_channel_peek(chan,(unsigned int)len, &c);
  while (r == CHANNEL_SUCCESS && symchar(c)) {
    // Both limits are enforced before storing, so len can never exceed
    // TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH and the terminator written
    // after the loop always lands inside tokpar_sym_str.
    if (len >= symbol_length_limit ||
        len >= TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH) {
      return TOKENIZER_SYMBOL_ERROR;
    }
    tokpar_sym_str[len] = (char)tolower((unsigned char)c);
    len ++;
    r = lbm_channel_peek(chan,(unsigned int)len, &c);
  }
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  tokpar_sym_str[len] = 0;
  return len;
}
195
196
224
/*
  Map the character that follows a backslash inside a string literal to
  the character it stands for. Characters without a mapping, and the
  backslash itself, yield a backslash.
*/
static char translate_escape_char(char c) {
  char translated = '\\';

  if (c == 'n') {
    translated = '\n';
  } else if (c == 'r') {
    translated = '\r';
  } else if (c == 't') {
    translated = '\t';
  } else if (c == '0') {
    translated = '\0';
  } else if (c == '\"') {
    translated = '\"';
  }
  return translated;
}
207
208
4308674
/*
  Try to read a double-quoted string literal from the front of the channel.

  On success the contents (with backslash escapes translated) are left in
  tokpar_sym_str, the number of stored characters is written to
  *string_len and the number of channel characters making up the literal,
  both quotes included, is returned.

  Returns TOKENIZER_NEED_MORE when a peek reports CHANNEL_MORE,
  TOKENIZER_NO_TOKEN when the channel does not start with a double quote,
  and TOKENIZER_STRING_ERROR when no closing quote is found (end of
  channel, or contents longer than
  TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH).
*/
int tok_string(lbm_char_channel_t *chan, unsigned int *string_len) {

  unsigned int n = 0;
  unsigned int len = 0;
  char c;
  bool encode = false; // true while the previous character was an unescaped backslash

  int r = lbm_channel_peek(chan,0,&c);
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  else if (r == CHANNEL_END) return TOKENIZER_NO_TOKEN;

  if (c != '\"') return TOKENIZER_NO_TOKEN;
  n++;

  memset(tokpar_sym_str,0,TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH+1);

  // read string into buffer
  r = lbm_channel_peek(chan,n,&c);
  while (r == CHANNEL_SUCCESS && (c != '\"' || encode) &&
         len < TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH) {
    if (c == '\\' && !encode) {
      encode = true;
    } else {
      tokpar_sym_str[len] = encode ? translate_escape_char(c) : c ;
      len++;
      encode = false;
    }
    n ++;
    r = lbm_channel_peek(chan, n, &c);
  }

  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;

  // The literal is complete only if the last peek really delivered an
  // unescaped closing quote. Looking at c alone is not enough: when the
  // last peek did not succeed, c may still hold an earlier character
  // (the opening quote or an escaped quote), and when the length limit
  // stopped the loop, c may be a quote that is still escaped.
  if (r != CHANNEL_SUCCESS || encode || c != '\"') return TOKENIZER_STRING_ERROR;

  *string_len = len;
  n ++;
  return (int)n;
}
247
248
168
/*
  Try to read a character literal of the form \#c or \#\c, where the
  letter in the second form is looked up in special_chars.

  Returns 3 or 4 (the number of channel characters used) with the
  character stored in *res, TOKENIZER_NEED_MORE when a peek reports
  CHANNEL_MORE, TOKENIZER_NO_TOKEN when the channel ends or does not start
  with \#, and TOKENIZER_CHAR_ERROR for an unknown letter after \#\.
*/
int tok_char(lbm_char_channel_t *chan, char *res) {

  static const char prefix[2] = {'\\', '#'};
  char c;
  int r;

  // Every character literal starts with the two characters \#
  for (unsigned int pos = 0; pos < 2; pos ++) {
    r = lbm_channel_peek(chan, pos, &c);
    if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
    if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;
    if (c != prefix[pos])  return TOKENIZER_NO_TOKEN;
  }

  r = lbm_channel_peek(chan, 2, &c);
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;

  if (c != '\\') {
    // plain form: the third character is the value
    *res = c;
    return 3;
  }

  // escaped form: the fourth character selects a row of special_chars
  r = lbm_channel_peek(chan, 3, &c);
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;

  for (int i = 0; i < NUM_SPECIAL_CHARS; i ++) {
    if (c == special_chars[i][0]) {
      *res = special_chars[i][1];
      return 4;
    }
  }
  return TOKENIZER_CHAR_ERROR;
}
290
291
4299293
/*
  Try to read a floating point literal from the front of the channel:
  an optional '-', optional digits, a '.', at least one digit, an optional
  exponent introduced by 'e', and an optional type suffix from
  type_qual_table.

  On success result->value holds the value as converted by strtod,
  result->negative tells whether the literal started with '-',
  result->type holds the suffix token (TOKTYPEF32 without a suffix), and
  the number of channel characters making up the literal is returned.

  Returns TOKENIZER_NEED_MORE when a peek reports CHANNEL_MORE and
  TOKENIZER_NO_TOKEN when the input is not a floating point literal or
  has more characters than fit in the conversion buffer.
*/
int tok_double(lbm_char_channel_t *chan, token_float *result) {

  unsigned int n = 0;
  char fbuf[128];
  char c;
  int res;

  // The last byte of fbuf must stay 0 for strtod, so this is the largest
  // number of literal characters that may be stored. Every store below is
  // preceded by a check against it.
  const unsigned int max_chars = (unsigned int)sizeof(fbuf) - 1;

  memset(fbuf, 0, sizeof(fbuf));

  result->type = TOKTYPEF32;
  result->negative = false;

  res = lbm_channel_peek(chan, 0, &c);
  if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
  if (c == '-') {
    n = 1;
    fbuf[0] = '-';
    result->negative = true;
  }

  res = lbm_channel_peek(chan, n, &c);
  if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;

  // digits before the decimal point (may be none)
  while (c >= '0' && c <= '9') {
    if (n >= max_chars) return TOKENIZER_NO_TOKEN;
    fbuf[n] = c;
    n++;
    res = lbm_channel_peek(chan, n, &c);
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
    if (res == CHANNEL_END) break;
  }

  // the decimal point is mandatory
  if (c != '.') return TOKENIZER_NO_TOKEN;
  if (n >= max_chars) return TOKENIZER_NO_TOKEN;
  fbuf[n] = c;
  n ++;

  res = lbm_channel_peek(chan,n, &c);
  if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
  else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
  if (!(c >= '0' && c <= '9')) return TOKENIZER_NO_TOKEN;

  // digits after the decimal point (at least one, checked above)
  while (c >= '0' && c <= '9') {
    if (n >= max_chars) return TOKENIZER_NO_TOKEN;
    fbuf[n] = c;
    n++;
    res = lbm_channel_peek(chan, n, &c);
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
    if (res == CHANNEL_END) break;
  }

  // optional exponent
  if (c == 'e') {
    if (n >= max_chars) return TOKENIZER_NO_TOKEN;
    fbuf[n] = c;
    n++;
    res = lbm_channel_peek(chan,n, &c);
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
    else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
    if (!((c >= '0' && c <= '9') || c == '-')) return TOKENIZER_NO_TOKEN;

    while ((c >= '0' && c <= '9') || c == '-') {
      if (n >= max_chars) return TOKENIZER_NO_TOKEN;
      fbuf[n] = c;
      n++;
      res = lbm_channel_peek(chan, n, &c);
      if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
      if (res == CHANNEL_END) break;
    }
  }

  // optional type suffix directly after the number
  uint32_t tok_res;
  int type_len = tok_match_fixed_size_tokens(chan, type_qual_table, n, NUM_TYPE_QUALIFIERS, &tok_res);

  if (type_len == TOKENIZER_NEED_MORE) return type_len;
  if (type_len == TOKENIZER_NO_TOKEN) {
    result->type = TOKTYPEF32;
    type_len = 0; // no suffix contributes no characters to the token
  } else {
    result->type = tok_res;
  }

  // At this point the literal has a '.' and at least one digit after it,
  // so there is always something for strtod to convert.
  result->value = (double)strtod(fbuf,NULL);
  return (int)n + type_len;
}
384
385
5788632
/*
  Drop whitespace and ';' comments from the front of the channel.

  A comment runs from ';' up to and including the next newline. Whether
  the tokenizer is inside a comment is kept in the channel
  (lbm_channel_comment / lbm_channel_set_comment) and is therefore still
  known on the next call.

  Returns false when a peek reports CHANNEL_MORE, true otherwise (the
  channel is then at its end or at a character that is neither whitespace
  nor the start of a comment).
*/
bool tok_clean_whitespace(lbm_char_channel_t *chan) {

  bool cleaning_whitespace = true;
  char c;
  int r;

  while (cleaning_whitespace) {

    if (lbm_channel_comment(chan)) {
      // inside a comment: drop everything up to and including the newline
      while (true) {
        r = lbm_channel_peek(chan, 0, &c);
        if (r == CHANNEL_END) {
          lbm_channel_set_comment(chan, false);
          cleaning_whitespace = false;
          break;
        }
        if (r == CHANNEL_MORE) {
          return false;
        }
        lbm_channel_drop(chan,1);
        if (c == '\n') {
          lbm_channel_set_comment(chan, false);
          break;
        }
      }
    }

    do {
      r = lbm_channel_peek(chan, 0, &c);
      if (r == CHANNEL_MORE) {
        return false;
      } else if (r == CHANNEL_END) {
        return true;
      }
      if (c == ';') {
        // the ';' itself is dropped by the comment loop above
        lbm_channel_set_comment(chan, true);
        break;
      }
      // isspace requires a value representable as unsigned char; a plain
      // char holding a byte above 127 may be negative.
      if (isspace((unsigned char)c)) {
        lbm_channel_drop(chan,1);
      } else {
        cleaning_whitespace = false;
      }

    } while (cleaning_whitespace);
  }
  return true;
}
433
434
4287365
/*
  Try to read an integer literal from the front of the channel:
  an optional '-', then either decimal digits or "0x"/"0X" followed by
  hexadecimal digits, and an optional type suffix from type_qual_table.

  On success result->value holds the magnitude, result->negative tells
  whether the literal started with '-', result->type holds the suffix
  token (TOKTYPEI without a suffix), and the number of channel characters
  making up the literal is returned. The magnitude is accumulated in a
  uint64_t without an overflow check.

  Returns TOKENIZER_NEED_MORE when a peek reports CHANNEL_MORE and
  TOKENIZER_NO_TOKEN when the input is not an integer literal.
*/
int tok_integer(lbm_char_channel_t *chan, token_int *result) {
  uint64_t acc = 0;
  unsigned int n = 0;
  char c;
  int res;

  result->type = TOKTYPEI;
  result->negative = false;
  res = lbm_channel_peek(chan, 0, &c);
  if (res == CHANNEL_MORE) {
    return TOKENIZER_NEED_MORE;
  } else if (res == CHANNEL_END) {
    return TOKENIZER_NO_TOKEN;
  }
  if (c == '-') {
    n = 1;
    result->negative = true;
  }

  // look for a "0x" / "0X" prefix
  bool hex = false;
  res = lbm_channel_peek(chan, n, &c);
  if (res == CHANNEL_SUCCESS && c == '0') {
    res = lbm_channel_peek(chan, n + 1, &c);
    if ( res == CHANNEL_SUCCESS && (c == 'x' || c == 'X')) {
      hex = true;
    } else if (res == CHANNEL_MORE) {
      return TOKENIZER_NEED_MORE;
    }
  } else if (res == CHANNEL_MORE) {
    return TOKENIZER_NEED_MORE;
  }

  if (hex) {
    unsigned int hex_digits = 0;
    n += 2;

    res = lbm_channel_peek(chan,n, &c);

    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
    else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;

    while ((c >= '0' && c <= '9') ||
           (c >= 'a' && c <= 'f') ||
           (c >= 'A' && c <= 'F')) {
      uint32_t val; /* values between 0 and 15 */
      if (c >= 'a' && c <= 'f') {
        val = 10 + (uint32_t)(c - 'a');
      } else if (c >= 'A' && c <= 'F') {
        val = 10 + (uint32_t)(c - 'A');
      } else {
        val = (uint32_t)(c - '0');
      }
      acc = (acc * 0x10) + val;
      hex_digits++;
      n++;
      res = lbm_channel_peek(chan, n, &c);
      if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
      if (res == CHANNEL_END) break;
    }

    // A bare "0x" prefix without any digit is not a number.
    if (hex_digits == 0) return TOKENIZER_NO_TOKEN;
  } else {
    res = lbm_channel_peek(chan, n, &c);
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
    while (c >= '0' && c <= '9') {
      acc = (acc*10) + (uint32_t)(c - '0');
      n++;
      res = lbm_channel_peek(chan, n, &c);
      if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
      if (res == CHANNEL_END)  break;
    }
  }

  if (n == 0) return TOKENIZER_NO_TOKEN;

  // optional type suffix directly after the digits
  uint32_t tok_res;
  int type_len = tok_match_fixed_size_tokens(chan, type_qual_table, n, NUM_TYPE_QUALIFIERS, &tok_res);

  if (type_len == TOKENIZER_NEED_MORE) return type_len;
  if (type_len != TOKENIZER_NO_TOKEN) {
    result->type = tok_res;
  } else {
    type_len = 0; // no suffix contributes no characters to the token
  }

  // A '-' that is not followed by anything counted above is not a number.
  if (result->negative && n <= 1) return TOKENIZER_NO_TOKEN;

  result->value = acc;
  return (int)n + type_len;
}