GCC Code Coverage Report
Directory: ../src/ Exec Total Coverage
File: /home/joels/Current/lispbm/src/tokpar.c Lines: 245 270 90.7 %
Date: 2024-12-05 14:36:58 Branches: 206 283 72.8 %

Line Branch Exec Source
1
/*
2
    Copyright 2019, 2021, 2022 Joel Svensson  svenssonjoel@yahoo.se
3
4
    This program is free software: you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation, either version 3 of the License, or
7
    (at your option) any later version.
8
9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13
14
    You should have received a copy of the GNU General Public License
15
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
*/
17
18
#include <stdbool.h>
19
#include <ctype.h>
20
#include <string.h>
21
#include <stdlib.h>
22
23
#include "lbm_memory.h"
24
#include "lbm_types.h"
25
#include "lbm_channel.h"
26
#include "tokpar.h"
27
#include "symrepr.h"
28
#include "heap.h"
29
#include "env.h"
30
31
// +1 to ensure there is always a zero at last ix
32
char tokpar_sym_str[TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH+1];
33
34
typedef struct {
35
  const char *str;
36
  uint32_t token;
37
  uint32_t len;
38
} matcher;
39
40
/*
41
  \#\a -> 7                 ; control-g
42
  \#\b -> 8                 ; backspace, BS
43
  \#\t -> 9                 ; tab, TAB
44
  \#\n -> 10                ; newline
45
  \#\v -> 11                ; vertical tab
46
  \#\f -> 12                ; formfeed character
47
  \#\r -> 13                ; carriage return, RET
48
  \#\e -> 27                ; escape character, ESC
49
  \#\s -> 32                ; space character, SPC
50
  \#\\ -> 92                ; backslash character, \
51
  \#\d -> 127               ; delete character, DEL
52
*/
53
54
#define NUM_SPECIAL_CHARS 11
55
const char special_chars[NUM_SPECIAL_CHARS][2] =
56
  {{'a', '\a'},
57
   {'b', '\b'},
58
   {'t', '\t'},
59
   {'n', '\n'},
60
   {'v', '\v'},
61
   {'f', '\f'},
62
   {'r', '\r'},
63
   {'e', 27},
64
   {'s', 32},
65
   {'\\', '\\'},
66
   {'d', 127}};
67
68
#define NUM_FIXED_SIZE_TOKENS 16
69
const matcher fixed_size_tokens[NUM_FIXED_SIZE_TOKENS] = {
70
  {"(", TOKOPENPAR, 1},
71
  {")", TOKCLOSEPAR, 1},
72
  {"[", TOKOPENBRACK, 1},
73
  {"]", TOKCLOSEBRACK, 1},
74
  {".", TOKDOT, 1},
75
  {"_", TOKDONTCARE, 1},
76
  {"'", TOKQUOTE, 1},
77
  {"`", TOKBACKQUOTE, 1},
78
  {",@", TOKCOMMAAT, 2},
79
  {",", TOKCOMMA, 1},
80
  {"?", TOKMATCHANY, 1},
81
  {"{", TOKOPENCURL, 1},
82
  {"}", TOKCLOSECURL, 1},
83
  {"@const-start", TOKCONSTSTART, 12},
84
  {"@const-end", TOKCONSTEND, 10},
85
};
86
87
#define NUM_TYPE_QUALIFIERS 9
88
const matcher type_qual_table[NUM_TYPE_QUALIFIERS] = {
89
  {"f64", TOKTYPEF64,  3},
90
  {"f32", TOKTYPEF32,  3},
91
  {"i64", TOKTYPEI64,  3},
92
  {"u64", TOKTYPEU64,  3},
93
  {"i32", TOKTYPEI32,  3},
94
  {"u32", TOKTYPEU32,  3},
95
  {"i"  , TOKTYPEI,    1},
96
  {"u"  , TOKTYPEU,    1},
97
  {"b"  , TOKTYPEBYTE, 1}
98
};
99
100
9045606
static int tok_match_fixed_size_tokens(lbm_char_channel_t *chan, const matcher *m, unsigned int start_pos, unsigned int num, uint32_t *res) {
101
102
104400708
  for (unsigned int i = 0; i < num; i ++) {
103
101054148
    uint32_t tok_len = m[i].len;
104
101054148
    const char *match_str = m[i].str;
105
    char c;
106
    int char_pos;
107
102563712
    for (char_pos = 0; char_pos < (int)tok_len; char_pos ++) {
108
96864666
      int r = lbm_channel_peek(chan,(unsigned int)char_pos + start_pos, &c);
109
96864666
      if (r == CHANNEL_SUCCESS) {
110
96864106
        if (c != match_str[char_pos]) break;
111
560
      } else if (r == CHANNEL_MORE ) {
112
5699046
        return TOKENIZER_NEED_MORE;
113
      } else {
114
560
        break;
115
      }
116
    }
117
118
101054148
    if (char_pos == (int)tok_len) { //match
119
5699046
      *res = m[i].token;
120
5699046
      return (int)tok_len;
121
    }
122
  }
123
3346560
  return TOKENIZER_NO_TOKEN;
124
}
125
126
5671998
int tok_syntax(lbm_char_channel_t *chan, uint32_t *res) {
127
5671998
  return tok_match_fixed_size_tokens(chan, fixed_size_tokens, 0, NUM_FIXED_SIZE_TOKENS, res);
128
}
129
130
3433717
static bool alpha_char(char c) {
131

3440801
  return ((c >= 'a' && c <= 'z') ||
132
7084
          (c >= 'A' && c <= 'Z'));
133
}
134
135
1067448
static bool num_char(char c) {
136

1067448
  return (c >= '0' && c <= '9');
137
}
138
139
885450
static bool symchar0(char c) {
140
885450
  const char *allowed = "+-*/=<>#!";
141
142
885450
  if (alpha_char(c)) return true;
143
54992
  int i = 0;
144
172228
  while (allowed[i] != 0) {
145
172060
    if (c == allowed[i]) return true;
146
117236
    i ++;
147
  }
148
168
  return false;
149
}
150
151
2548267
static bool symchar(char c) {
152
2548267
  const char *allowed = "+-*/=<>!?_";
153
154

2548267
  if (alpha_char(c) || num_char(c)) return true;
155
949680
  int i = 0;
156
9813812
  while (allowed[i] != 0) {
157
8940212
    if (c == allowed[i]) return true;
158
8864132
    i++;
159
  }
160
873600
  return false;
161
}
162
163
885450
int tok_symbol(lbm_char_channel_t *chan) {
164
165
  char c;
166
885450
  int r = 0;
167
168
885450
  r = lbm_channel_peek(chan, 0, &c);
169
885450
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
170
885450
  if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;
171

885450
  if (r == CHANNEL_SUCCESS && !symchar0(c)) {
172
168
    return TOKENIZER_NO_TOKEN;
173
  }
174
885282
  memset(tokpar_sym_str,0,TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH+1);
175
885282
  tokpar_sym_str[0] = (char)tolower(c);
176
177
885282
  int len = 1;
178
179
885282
  r = lbm_channel_peek(chan,(unsigned int)len, &c);
180

2559949
  while (r == CHANNEL_SUCCESS && symchar(c)) {
181
1674667
    if (len >= 255) return TOKENIZER_SYMBOL_ERROR;
182
1674667
    c = (char)tolower(c);
183
1674667
    if (len < TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH) {
184
1674667
      tokpar_sym_str[len] = (char)c;
185
    }
186
1674667
    len ++;
187
1674667
    r = lbm_channel_peek(chan,(unsigned int)len, &c);
188
  }
189
885282
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
190
885276
  tokpar_sym_str[len] = 0;
191
885276
  return len;
192
}
193
194
224
static char translate_escape_char(char c) {
195

224
  switch(c) {
196
  case '\\': return '\\';
197
56
  case 'n': return '\n';
198
  case 'r': return '\r';
199
  case 't': return '\t';
200
  case '0': return '\0';
201
168
  case '\"': return '\"';
202
  default: return '\\';
203
  }
204
}
205
206
4264942
int tok_string(lbm_char_channel_t *chan, unsigned int *string_len) {
207
208
4264942
  unsigned int n = 0;
209
4264942
  unsigned int len = 0;
210
  char c;
211
4264942
  int r = 0;
212
4264942
  bool encode = false;
213
214
4264942
  r = lbm_channel_peek(chan,0,&c);
215
4264942
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
216
4264942
  else if (r == CHANNEL_END) return TOKENIZER_NO_TOKEN;
217
218
4264942
  if (c != '\"') return TOKENIZER_NO_TOKEN;;
219
9380
  n++;
220
221
9380
  memset(tokpar_sym_str,0,TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH+1);
222
223
  // read string into buffer
224
9380
  r = lbm_channel_peek(chan,n,&c);
225


61936
  while (r == CHANNEL_SUCCESS && (c != '\"' || encode) &&
226
	 len < TOKENIZER_MAX_SYMBOL_AND_STRING_LENGTH) {
227

52556
    if (c == '\\' && !encode) {
228
224
      encode = true;
229
    } else {
230
52332
      tokpar_sym_str[len] = encode ? translate_escape_char(c) : c ;
231
52332
      len++;
232
52332
      encode = false;
233
    }
234
52556
    n ++;
235
52556
    r = lbm_channel_peek(chan, n, &c);
236
  }
237
238
9380
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
239
9380
  if (c != '\"') return TOKENIZER_STRING_ERROR;
240
241
9380
  *string_len = len;
242
9380
  n ++;
243
9380
  return (int)n;
244
}
245
246
168
int tok_char(lbm_char_channel_t *chan, char *res) {
247
248
  char c;
249
  int r;
250
251
168
  r = lbm_channel_peek(chan, 0, &c);
252
168
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
253
168
  if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;
254
255
168
  if (c != '\\') return TOKENIZER_NO_TOKEN;
256
257
168
  r = lbm_channel_peek(chan, 1, &c);
258
168
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
259
168
  if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;
260
261
168
  if (c != '#') return TOKENIZER_NO_TOKEN;
262
263
168
  r = lbm_channel_peek(chan, 2, &c);
264
168
  if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
265
168
  if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;
266
267
168
  if (c == '\\') {
268
    r = lbm_channel_peek(chan, 3, &c);
269
    if (r == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
270
    if (r == CHANNEL_END)  return TOKENIZER_NO_TOKEN;
271
272
    bool ok = false;
273
    for (int i = 0; i < NUM_SPECIAL_CHARS; i ++) {
274
      if (c == special_chars[i][0]) {
275
        *res = special_chars[i][1];
276
        ok = true;
277
      }
278
    }
279
    if (ok) {
280
      return 4;
281
    } else {
282
      return TOKENIZER_CHAR_ERROR;
283
    }
284
  }
285
168
  *res = c;
286
168
  return 3;
287
}
288
289
4255562
int tok_double(lbm_char_channel_t *chan, token_float *result) {
290
291
4255562
  unsigned int n = 0;
292
  char fbuf[128];
293
  char c;
294
4255562
  bool valid_num = false;
295
  int res;
296
297
4255562
  memset(fbuf, 0, 128);
298
299
4255562
  result->type = TOKTYPEF32;
300
4255562
  result->negative = false;
301
302
4255562
  res = lbm_channel_peek(chan, 0, &c);
303
4255562
  if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
304
4255562
  else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
305
4255562
  if (c == '-') {
306
5488
    n = 1;
307
5488
    fbuf[0] = '-';
308
5488
    result->negative = true;
309
  }
310
311
4255562
  res = lbm_channel_peek(chan, n, &c);
312
4255562
  if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
313
4255562
  else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
314

8514449
  while (c >= '0' && c <= '9') {
315
4258944
    fbuf[n] = c;
316
4258944
    n++;
317
4258944
    res = lbm_channel_peek(chan, n, &c);
318
4258944
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
319
4258943
    if (res == CHANNEL_END) break;
320
  }
321
322
4255561
  if (c == '.') {
323
13132
    fbuf[n] = c;
324
13132
    n ++;
325
  }
326
327
4242429
  else return TOKENIZER_NO_TOKEN;
328
329
13132
  res = lbm_channel_peek(chan,n, &c);
330
13132
  if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
331
13132
  else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
332

13132
  if (!(c >= '0' && c <= '9')) return TOKENIZER_NO_TOKEN;
333
334

31220
  while (c >= '0' && c <= '9') {
335
18088
    fbuf[n] = c;
336
18088
    n++;
337
18088
    res = lbm_channel_peek(chan, n, &c);
338
18088
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
339
18088
    if (res == CHANNEL_END) break;
340
  }
341
342
13132
  if (c == 'e') {
343
112
    fbuf[n] = c;
344
112
    n++;
345
112
    res = lbm_channel_peek(chan,n, &c);
346
112
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
347
112
    else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
348

112
    if (!((c >= '0' && c <= '9') || c == '-')) return TOKENIZER_NO_TOKEN;
349
350

224
    while ((c >= '0' && c <= '9') || c == '-') {
351
112
      fbuf[n] = c;
352
112
      n++;
353
112
      res = lbm_channel_peek(chan, n, &c);
354
112
      if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
355
112
      if (res == CHANNEL_END) break;
356
    }
357
  }
358
359
  uint32_t tok_res;
360
13132
  int type_len = tok_match_fixed_size_tokens(chan, type_qual_table, n, NUM_TYPE_QUALIFIERS, &tok_res);
361
362
13132
  if (type_len == TOKENIZER_NEED_MORE) return type_len;
363
13132
  if (type_len == TOKENIZER_NO_TOKEN) {
364
7224
    result->type = TOKTYPEF32;
365
  } else {
366
5908
    result->type = tok_res;
367
  }
368
369

13132
  if ((result->negative && n > 1) ||
370

13132
      (!result->negative && n > 0)) valid_num = true;
371
372
13132
  if (n > 127) {
373
    return TOKENIZER_NO_TOKEN;
374
  }
375
376
13132
  if(valid_num) {
377
13132
    result->value = (double)strtod(fbuf,NULL);
378
13132
    return (int)n + type_len;
379
  }
380
  return TOKENIZER_NO_TOKEN;
381
}
382
383
5694160
bool tok_clean_whitespace(lbm_char_channel_t *chan) {
384
385
5694160
  bool cleaning_whitespace = true;
386
  char c;
387
  int r;
388
389
11371646
  while (cleaning_whitespace) {
390
391
5699648
    if (lbm_channel_comment(chan)) {
392
      while (true) {
393
662207
        r = lbm_channel_peek(chan, 0, &c);
394
662207
        if (r == CHANNEL_END) {
395
          lbm_channel_set_comment(chan, false);
396
          cleaning_whitespace = false;
397
          break;
398
        }
399
662207
        if (r == CHANNEL_MORE) {
400
672
          return false;
401
        }
402
661535
        lbm_channel_drop(chan,1);
403
661535
        if (c == '\n') {
404
5481
          lbm_channel_set_comment(chan, false);
405
5481
          break;
406
        }
407
      }
408
    }
409
410
    do {
411
10222932
      r = lbm_channel_peek(chan, 0, &c);
412
10222932
      if (r == CHANNEL_MORE) {
413
25
        return false;
414
10222907
      } else if (r == CHANNEL_END) {
415
21465
        return true;
416
      }
417
10201442
      if (c == ';') {
418
5488
        lbm_channel_set_comment(chan, true);
419
5488
        break;
420
      }
421
10195954
      if (isspace(c)) {
422
4523956
        lbm_channel_drop(chan,1);
423
      } else {
424
5671998
        cleaning_whitespace = false;
425
      }
426
427
10195954
    } while (cleaning_whitespace);
428
  }
429
5671998
  return true;
430
}
431
432
4242429
int tok_integer(lbm_char_channel_t *chan, token_int *result) {
433
4242429
  uint64_t acc = 0;
434
4242429
  unsigned int n = 0;
435
4242429
  bool valid_num = false;
436
  char c;
437
  int res;
438
439
4242429
  result->type = TOKTYPEI;
440
4242429
  result-> negative = false;
441
4242429
  res = lbm_channel_peek(chan, 0, &c);
442
4242429
  if (res == CHANNEL_MORE) {
443
    return TOKENIZER_NEED_MORE;
444
4242429
  } else if (res == CHANNEL_END) {
445
    return TOKENIZER_NO_TOKEN;
446
  }
447
4242429
  if (c == '-') {
448
5264
    n = 1;
449
5264
    result->negative = true;
450
  }
451
452
4242429
  bool hex = false;
453
4242429
  res = lbm_channel_peek(chan, n, &c);
454

4242429
  if (res == CHANNEL_SUCCESS && c == '0') {
455
28423
    res = lbm_channel_peek(chan, n + 1, &c);
456

28423
    if ( res == CHANNEL_SUCCESS && (c == 'x' || c == 'X')) {
457
15739
      hex = true;
458
12684
    } else if (res == CHANNEL_MORE) {
459
      return TOKENIZER_NEED_MORE;
460
    }
461
4214006
  } else if (res == CHANNEL_MORE) {
462
    return TOKENIZER_NEED_MORE;
463
  }
464
465
4242429
  if (hex) {
466
15739
    n += 2;
467
468
15739
    res = lbm_channel_peek(chan,n, &c);
469
470
15739
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
471
15738
    else if (res == CHANNEL_END) return TOKENIZER_NO_TOKEN;
472
473

51692
    while ((c >= '0' && c <= '9') ||
474

25594
           (c >= 'a' && c <= 'f') ||
475

25566
           (c >= 'A' && c <= 'F')) {
476
      uint32_t val; /* values between 0 and 16 */
477

35956
      if (c >= 'a' && c <= 'f') {
478
28
        val = 10 + (uint32_t)c - 'a';
479

35928
      } else if (c >= 'A' && c <= 'F') {
480
9830
        val = 10 + (uint32_t)(c - 'A');
481
      } else {
482
26098
        val = (uint32_t)c - '0';
483
      }
484
35956
      acc = (acc * 0x10) + val;
485
35956
      n++;
486
35956
      res = lbm_channel_peek(chan, n, &c);
487
35956
      if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
488
35954
      if (res == CHANNEL_END) break;
489
490
    }
491
  } else {
492
4226690
    res = lbm_channel_peek(chan, n, &c);
493
4226690
    if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
494

8456034
    while (c >= '0' && c <= '9') {
495
4229400
      acc = (acc*10) + (uint32_t)(c - '0');
496
4229400
      n++;
497
4229400
      res = lbm_channel_peek(chan, n, &c);
498
4229400
      if (res == CHANNEL_MORE) return TOKENIZER_NEED_MORE;
499
4229400
      if (res == CHANNEL_END)  break;
500
    }
501
  }
502
503
4242426
  if (n == 0) return TOKENIZER_NO_TOKEN;
504
505
  uint32_t tok_res;
506
3360476
  int type_len = tok_match_fixed_size_tokens(chan, type_qual_table, n, NUM_TYPE_QUALIFIERS, &tok_res);
507
508
3360476
  if (type_len == TOKENIZER_NEED_MORE) return type_len;
509
3360476
  if (type_len != TOKENIZER_NO_TOKEN) {
510
21140
    result->type = tok_res;
511
  }
512
513

3360476
  if ((result->negative && n > 1) ||
514
3360476
      !result->negative) valid_num = true;
515
516
3360476
  if (valid_num) {
517
3356976
    result->value = acc;
518
3356976
    return (int)n + type_len;
519
  }
520
3500
  return TOKENIZER_NO_TOKEN;
521
}