xref: /AOO41X/main/soltools/cpp/_lex.c (revision 7ce203730a656307c389f82ee60505af35570552)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__))
26 #include <io.h>
27 #else
28 #include <unistd.h>
29 #endif
30 #include "cpp.h"
31 /*
32  * lexical FSM encoding
33  *   when in state state, and one of the characters
34  *   in ch arrives, enter nextstate.
35  *   States >= S_SELF are either final, or at least require special action.
36  *   In 'fsm' there is a line for each state X charset X nextstate.
37  *   List chars that overwrite previous entries later (e.g. C_ALPH
38  *   can be overridden by '_' by a later entry); C_XX is
39  *   the universal set, and should always be first.
40  *   States above S_SELF are represented in the big table as negative values.
41  *   S_SELF and S_SELFB encode the resulting token type in the upper bits.
42  *   These actions differ in that S_SELF doesn't have a lookahead char,
43  *   S_SELFB does.
44  *
45  *   The encoding is blown out into a big table for time-efficiency.
46  *   Entries have
47  *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
48  */
49 
/*
 * Packing of an FSM action word (see the table layout described above):
 *   low 6 bits   next state / action code
 *   bit 6        QBSBIT, the "?\ marker" set up by expandlex()
 *   upper bits   token type
 *
 * MAXSTATE is the size of the state dimension of bigfsm and the value of
 * the first action code (S_SELF).
 *
 * ACT(tok, act) builds an action word; GETACT(st) extracts the token type
 * again.  Both macros fully parenthesize their arguments so that they
 * expand correctly when given an expression (e.g. ACT(a | b, act)).
 */
#define MAXSTATE        32
#define ACT(tok,act)    (((tok) << 7) + (act))
#define QBSBIT          0100
#define GETACT(st)      (((st) >> 7) & 0x1ff)

/*
 * Character classes.  These small values appear in the ch[] lists of the
 * fsm table in place of literal characters; expandlex() expands C_XX,
 * C_ALPH and C_NUM into the matching set of input bytes.
 */
#define C_WS    1
#define C_ALPH  2
#define C_NUM   3
#define C_EOF   4
#define C_XX    5
61 
62 enum state
63 {
64     START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
65     CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
66     CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
67     S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
68     S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
69 };
70 
71 int tottok;
72 int tokkind[256];
73 struct fsm
74 {
75     int state;                          /* if in this state */
76     uchar ch[4];                        /* and see one of these characters */
77     int nextstate;                      /* enter this state if +ve */
78 };
79 
80  /*const*/ struct fsm fsm[] = {
81     /* start state */
82          {START, {C_XX}, ACT(UNCLASS, S_SELF)},
83          {START, {' ', '\t', '\v'}, WS1},
84          {START, {C_NUM}, NUM1},
85          {START, {'.'}, NUM3},
86          {START, {C_ALPH}, ID1},
87          {START, {'L'}, ST1},
88          {START, {'"'}, ST2},
89          {START, {'\''}, CC1},
90          {START, {'/'}, COM1},
91          {START, {EOFC}, S_EOF},
92          {START, {'\n'}, S_NL},
93          {START, {'-'}, MINUS1},
94          {START, {'+'}, PLUS1},
95          {START, {'<'}, LT1},
96          {START, {'>'}, GT1},
97          {START, {'='}, ASG1},
98          {START, {'!'}, NOT1},
99          {START, {'&'}, AND1},
100          {START, {'|'}, OR1},
101          {START, {'#'}, SHARP1},
102          {START, {'%'}, PCT1},
103          {START, {'['}, ACT(SBRA, S_SELF)},
104          {START, {']'}, ACT(SKET, S_SELF)},
105          {START, {'('}, ACT(LP, S_SELF)},
106          {START, {')'}, ACT(RP, S_SELF)},
107          {START, {'*'}, STAR1},
108          {START, {','}, ACT(COMMA, S_SELF)},
109          {START, {'?'}, ACT(QUEST, S_SELF)},
110          {START, {':'}, ACT(COLON, S_SELF)},
111          {START, {';'}, ACT(SEMIC, S_SELF)},
112          {START, {'{'}, ACT(CBRA, S_SELF)},
113          {START, {'}'}, ACT(CKET, S_SELF)},
114          {START, {'~'}, ACT(TILDE, S_SELF)},
115          {START, {'^'}, CIRC1},
116 
117     /* saw a digit */
118          {NUM1, {C_XX}, ACT(NUMBER, S_SELFB)},
119          {NUM1, {C_NUM, C_ALPH, '.'}, NUM1},
120          {NUM1, {'E', 'e'}, NUM2},
121          {NUM1, {'_'}, ACT(NUMBER, S_SELFB)},
122 
123     /* saw possible start of exponent, digits-e */
124          {NUM2, {C_XX}, ACT(NUMBER, S_SELFB)},
125          {NUM2, {'+', '-'}, NUM1},
126          {NUM2, {C_NUM, C_ALPH}, NUM1},
127          {NUM2, {'_'}, ACT(NUMBER, S_SELFB)},
128 
129     /* saw a '.', which could be a number or an operator */
130          {NUM3, {C_XX}, ACT(DOT, S_SELFB)},
131          {NUM3, {'.'}, DOTS1},
132          {NUM3, {C_NUM}, NUM1},
133 
134          {DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB)},
135          {DOTS1, {C_NUM}, NUM1},
136          {DOTS1, {'.'}, ACT(ELLIPS, S_SELF)},
137 
138     /* saw a letter or _ */
139          {ID1, {C_XX}, ACT(NAME, S_NAME)},
140          {ID1, {C_ALPH, C_NUM}, ID1},
141 
142     /* saw L (start of wide string?) */
143          {ST1, {C_XX}, ACT(NAME, S_NAME)},
144          {ST1, {C_ALPH, C_NUM}, ID1},
145          {ST1, {'"'}, ST2},
146          {ST1, {'\''}, CC1},
147 
148     /* saw " beginning string */
149          {ST2, {C_XX}, ST2},
150          {ST2, {'"'}, ACT(STRING, S_SELF)},
151          {ST2, {'\\'}, ST3},
152          {ST2, {'\n'}, S_STNL},
153          {ST2, {EOFC}, S_EOFSTR},
154 
155     /* saw \ in string */
156          {ST3, {C_XX}, ST2},
157          {ST3, {'\n'}, S_STNL},
158          {ST3, {EOFC}, S_EOFSTR},
159 
160     /* saw ' beginning character const */
161          {CC1, {C_XX}, CC1},
162          {CC1, {'\''}, ACT(CCON, S_SELF)},
163          {CC1, {'\\'}, CC2},
164          {CC1, {'\n'}, S_STNL},
165          {CC1, {EOFC}, S_EOFSTR},
166 
167     /* saw \ in ccon */
168          {CC2, {C_XX}, CC1},
169          {CC2, {'\n'}, S_STNL},
170          {CC2, {EOFC}, S_EOFSTR},
171 
172     /* saw /, perhaps start of comment */
173          {COM1, {C_XX}, ACT(SLASH, S_SELFB)},
174          {COM1, {'='}, ACT(ASSLASH, S_SELF)},
175          {COM1, {'*'}, COM2},
176          {COM1, {'/'}, COM4},
177 
178     /* saw / followed by *, start of comment */
179          {COM2, {C_XX}, COM2},
180          {COM2, {'\n'}, S_COMNL},
181          {COM2, {'*'}, COM3},
182          {COM2, {EOFC}, S_EOFCOM},
183 
184     /* saw the * possibly ending a comment */
185          {COM3, {C_XX}, COM2},
186          {COM3, {'\n'}, S_COMNL},
187          {COM3, {'*'}, COM3},
188          {COM3, {'/'}, S_COMMENT},
189 
190     /* // comment */
191          {COM4, {C_XX}, COM4},
192          {COM4, {'\n'}, S_NL},
193          {COM4, {EOFC}, S_EOFCOM},
194 
195     /* saw white space, eat it up */
196          {WS1, {C_XX}, S_WS},
197          {WS1, {'\t', '\v', ' '}, WS1},
198 
199     /* saw -, check --, -=, -> */
200          {MINUS1, {C_XX}, ACT(MINUS, S_SELFB)},
201          {MINUS1, {'-'}, ACT(MMINUS, S_SELF)},
202          {MINUS1, {'='}, ACT(ASMINUS, S_SELF)},
203          {MINUS1, {'>'}, ACT(ARROW, S_SELF)},
204 
205     /* saw +, check ++, += */
206          {PLUS1, {C_XX}, ACT(PLUS, S_SELFB)},
207          {PLUS1, {'+'}, ACT(PPLUS, S_SELF)},
208          {PLUS1, {'='}, ACT(ASPLUS, S_SELF)},
209 
210     /* saw <, check <<, <<=, <= */
211          {LT1, {C_XX}, ACT(LT, S_SELFB)},
212          {LT1, {'<'}, LT2},
213          {LT1, {'='}, ACT(LEQ, S_SELF)},
214          {LT2, {C_XX}, ACT(LSH, S_SELFB)},
215          {LT2, {'='}, ACT(ASLSH, S_SELF)},
216 
217     /* saw >, check >>, >>=, >= */
218          {GT1, {C_XX}, ACT(GT, S_SELFB)},
219          {GT1, {'>'}, GT2},
220          {GT1, {'='}, ACT(GEQ, S_SELF)},
221          {GT2, {C_XX}, ACT(RSH, S_SELFB)},
222          {GT2, {'='}, ACT(ASRSH, S_SELF)},
223 
224     /* = */
225          {ASG1, {C_XX}, ACT(ASGN, S_SELFB)},
226          {ASG1, {'='}, ACT(EQ, S_SELF)},
227 
228     /* ! */
229          {NOT1, {C_XX}, ACT(NOT, S_SELFB)},
230          {NOT1, {'='}, ACT(NEQ, S_SELF)},
231 
232     /* & */
233          {AND1, {C_XX}, ACT(AND, S_SELFB)},
234          {AND1, {'&'}, ACT(LAND, S_SELF)},
235          {AND1, {'='}, ACT(ASAND, S_SELF)},
236 
237     /* | */
238          {OR1, {C_XX}, ACT(OR, S_SELFB)},
239          {OR1, {'|'}, ACT(LOR, S_SELF)},
240          {OR1, {'='}, ACT(ASOR, S_SELF)},
241 
242     /* # */
243          {SHARP1, {C_XX}, ACT(SHARP, S_SELFB)},
244          {SHARP1, {'#'}, ACT(DSHARP, S_SELF)},
245 
246     /* % */
247          {PCT1, {C_XX}, ACT(PCT, S_SELFB)},
248          {PCT1, {'='}, ACT(ASPCT, S_SELF)},
249 
250     /* * */
251          {STAR1, {C_XX}, ACT(STAR, S_SELFB)},
252          {STAR1, {'='}, ACT(ASSTAR, S_SELF)},
253 
254     /* ^ */
255          {CIRC1, {C_XX}, ACT(CIRC, S_SELFB)},
256          {CIRC1, {'='}, ACT(ASCIRC, S_SELF)},
257 
258          {-1, "", 0}
259 };
260 
261 /* first index is char, second is state */
262 /* increase #states to power of 2 to encourage use of shift */
263 short bigfsm[256][MAXSTATE];
264 
265 void
expandlex(void)266     expandlex(void)
267 {
268      /* const */ struct fsm *fp;
269     int i, j, nstate;
270 
271     for (fp = fsm; fp->state >= 0; fp++)
272     {
273         for (i = 0; fp->ch[i]; i++)
274         {
275             nstate = fp->nextstate;
276             if (nstate >= S_SELF)
277                 nstate = ~nstate;
278             switch (fp->ch[i])
279             {
280 
281                 case C_XX:              /* random characters */
282                     for (j = 0; j < 256; j++)
283                         bigfsm[j][fp->state] = (short) nstate;
284                     continue;
285                 case C_ALPH:
286                     for (j = 0; j < 256; j++)
287 #ifdef S390
288                         if( isalpha( j ) || (j == '_') )
289 #else
290                         if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z')
291                             || j == '_')
292 #endif
293                             bigfsm[j][fp->state] = (short) nstate;
294                     continue;
295                 case C_NUM:
296                     for (j = '0'; j <= '9'; j++)
297                         bigfsm[j][fp->state] = (short) nstate;
298                     continue;
299                 default:
300                     bigfsm[fp->ch[i]][fp->state] = (short) nstate;
301             }
302         }
303     }
304 
305     /*
306      * install special cases for ? (trigraphs),  \ (splicing), runes, and
307      * EOB
308      */
309     for (i = 0; i < MAXSTATE; i++)
310     {
311         for (j = 0; j < 0xFF; j++)
312             if (j == '?' || j == '\\' || j == '\n' || j == '\r')
313             {
314                 if (bigfsm[j][i] > 0)
315                     bigfsm[j][i] = ~bigfsm[j][i];
316                 bigfsm[j][i] &= ~QBSBIT;
317             }
318         bigfsm[EOB][i] = ~S_EOB;
319         if (bigfsm[EOFC][i] >= 0)
320             bigfsm[EOFC][i] = ~S_EOF;
321     }
322 }
323 
324 void
fixlex(void)325     fixlex(void)
326 {
327     /* do C++ comments? */
328     if ((Cplusplus == 0) || (Cflag != 0))
329         bigfsm['/'][COM1] = bigfsm['x'][COM1];
330 }
331 
332 /*
333  * fill in a row of tokens from input, terminated by NL or END
334  * First token is put at trp->lp.
335  * Reset is non-zero when the input buffer can be "rewound."
336  * The value is a flag indicating that possible macros have
337  * been seen in the row.
338  */
339 int
gettokens(Tokenrow * trp,int reset)340     gettokens(Tokenrow * trp, int reset)
341 {
    /*
     * Tokenizes input from cursource into trp, starting at trp->lp, by
     * running the bigfsm state machine over the source buffer.  Returns
     * when a newline (NL token) or end of input (END token) has been
     * stored; the return value is the OR of the quicklook() results for
     * all NAME tokens seen.
     */
342     register int c, state, oldstate;
343     register uchar *ip;
344     register Token *tp, *maxp;
345     int runelen;
346     Source *s = cursource;
347     int nmac = 0;
348 
349     tp = trp->lp;
350     ip = s->inp;
351     if (reset)
352     {
353         s->lineinc = 0;
354         if (ip >= s->inl)
355         {                               /* nothing in buffer */
356             s->inl = s->inb;
357             fillbuf(s);
358             ip = s->inp = s->inb;
359         }
360         else
361             if (ip >= s->inb + (3 * INS / 4))
362             {
                /*
                 * The read position is past three quarters of INS: slide
                 * the unread bytes plus the 4 trailing sentinel bytes to
                 * the start of the buffer.
                 */
363                 memmove(s->inb, ip, 4 + s->inl - ip);
364                 s->inl = s->inb + (s->inl - ip);
365                 ip = s->inp = s->inb;
366             }
367     }
368     maxp = &trp->bp[trp->max];
    /* runelen is the number of bytes consumed per step; it is always 1 here */
369     runelen = 1;
    /* outer loop: one iteration per token */
370     for (;;)
371     {
372 continue2:
        /* make room in the token row before starting a new token */
373         if (tp >= maxp)
374         {
375             trp->lp = tp;
376             tp = growtokenrow(trp);
377             maxp = &trp->bp[trp->max];
378         }
379         tp->type = UNCLASS;
380         tp->t = ip;
381         tp->wslen = 0;
382         tp->flag = 0;
383         state = START;
        /* inner loop: one iteration per input byte (or per retry of one) */
384         for (;;)
385         {
            /* remembered so the byte can be re-scanned after the buffer changes */
386             oldstate = state;
387 
388             c = *ip;
389 
            /* non-negative entry: ordinary transition, consume the byte */
390             if ((state = bigfsm[c][state]) >= 0)
391             {
392                 ip += runelen;
393                 runelen = 1;
394                 continue;
395             }
            /* negative entry: stored complemented, recover the action word */
396             state = ~state;
397     reswitch:
            /* low 7 bits: action code, possibly with QBSBIT set */
398             switch (state & 0177)
399             {
                    /* token includes the current byte; falls through to S_SELFB */
400                 case S_SELF:
401                     ip += runelen;
402                     runelen = 1;
                    /* token ends before the current byte; type comes from the action word */
403                 case S_SELFB:
404                     tp->type = (unsigned char) GETACT(state);
405                     tp->len = ip - tp->t;
406                     tp++;
407                     goto continue2;
408 
409                 case S_NAME:            /* like S_SELFB but with nmac check */
410                     tp->type = NAME;
411                     tp->len = ip - tp->t;
412                     nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0);
413                     tp++;
414                     goto continue2;
415 
                    /* end of white space: record its length and restart the token here */
416                 case S_WS:
417                     tp->wslen = ip - tp->t;
418                     tp->t = ip;
419                     state = START;
420                     continue;
421 
422                 default:
                    /* no QBSBIT: treat as an ordinary byte and consume it */
423                     if ((state & QBSBIT) == 0)
424                     {
425                         ip += runelen;
426                         runelen = 1;
427                         continue;
428                     }
                    /* QBSBIT set: c is one of '?', '\\', '\n', '\r' (see expandlex) */
429                     state &= ~QBSBIT;
430                     s->inp = ip;
431 
432                     if (c == '\n')
433                     {
                        /* make sure the byte after the newline is available */
434                         while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
435 
                        /* "\n\r": drop the '\r' by shifting the rest of the buffer left */
436                         if (s->inp[1] == '\r')
437                         {
438                             memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2);
439                             s->inl -= 1;
440                         }
441 
                        /* dispatch again on the action with QBSBIT removed */
442                         goto reswitch;
443                     }
444 
445                     if (c == '\r')
446                     {
447                         while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
448 
                        /* "\r\n": drop the '\r'; a lone '\r' is rewritten as '\n' */
449                         if (s->inp[1] == '\n')
450                         {
451                             memmove(s->inp, s->inp + 1, s->inl - s->inp + 1);
452                             s->inl -= 1;
453                         }
454                         else
455                             *s->inp = '\n';
456 
                        /* re-scan the (now '\n') byte from the previous state */
457                         state = oldstate;
458                         continue;
459                     }
460 
461                     if (c == '?')
462                     {                   /* check trigraph */
                        /* a replaced trigraph is re-scanned; otherwise '?' is handled normally */
463                         if (trigraph(s))
464                         {
465                             state = oldstate;
466                             continue;
467                         }
468                         goto reswitch;
469                     }
470                     if (c == '\\')
471                     {                   /* line-folding */
                        /* a removed backslash-newline still counts as a source line */
472                         if (foldline(s))
473                         {
474                             s->lineinc++;
475                             state = oldstate;
476                             continue;
477                         }
478                         goto reswitch;
479                     }
                    /* QBSBIT was set for a byte that is none of the special ones */
480                     error(WARNING, "Lexical botch in cpp");
481                     ip += runelen;
482                     runelen = 1;
483                     continue;
484 
                    /* end of buffered data: refill and retry the same position */
485                 case S_EOB:
486                     s->inp = ip;
487                     fillbuf(cursource);
488                     state = oldstate;
489                     continue;
490 
                    /* end of input: store an END token and return */
491                 case S_EOF:
492                     tp->type = END;
493                     tp->len = 0;
494                     s->inp = ip;
                    /* warn when the row does not end in NL and the source has fd != -1 */
495                     if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1)
496                         error(WARNING, "No newline at end of file");
497                     trp->lp = tp + 1;
498                     return nmac;
499 
                    /* newline inside a string/char constant: report, then fall through to S_NL */
500                 case S_STNL:
501                     error(ERROR, "Unterminated string or char const");
                    /* newline: store an NL token, step past it and return the row */
502                 case S_NL:
503                     tp->t = ip;
504                     tp->type = NL;
505                     tp->len = 1;
506                     tp->wslen = 0;
507                     s->lineinc++;
508                     s->inp = ip + 1;
509                     trp->lp = tp + 1;
510                     return nmac;
511 
512                 case S_EOFSTR:
513                     error(FATAL, "EOF in string or char constant");
514                     break;
515 
                    /* newline inside a block comment: count it and stay in the comment */
516                 case S_COMNL:
517                     s->lineinc++;
518                     state = COM2;
519                     ip += runelen;
520                     runelen = 1;
521                     continue;
522 
                    /* end of input inside a comment: warn, back up one byte, fall through */
523                 case S_EOFCOM:
524                     error(WARNING, "EOF inside comment");
525                     --ip;
526                 case S_COMMENT:
527                     if (!Cflag)
528                     {
                        /*
                         * Discard the comment: its last byte is overwritten
                         * with a space that becomes one byte of leading
                         * white space for the token that follows.
                         */
529                         tp->t = ++ip;
530                         tp->t[-1] = ' ';
531                         tp->wslen = 1;
532                         state = START;
533                         continue;
534                     }
535                     else
536                     {
                        /* keep the comment as a COMMENT token (finished below the loop) */
537                         runelen = 1;
538                         s->lineinc = 0;;
539                         tp->type = COMMENT;
540                         tp->flag |= XTWS;
541                     }
542             }
543             break;
544         }
        /* reached via break: consume the current byte and close the token */
545         ip += runelen;
546         runelen = 1;
547         tp->len = ip - tp->t;
548         tp++;
549     }
550 }
551 
552 /* have seen ?; handle the trigraph it starts (if any) else 0 */
553 int
trigraph(Source * s)554     trigraph(Source * s)
555 {
556     uchar c;
557 
558     while (s->inp + 2 >= s->inl && fillbuf(s) != EOF);
559     ;
560     if (s->inp[1] != '?')
561         return 0;
562     c = 0;
563     switch (s->inp[2])
564     {
565         case '=':
566             c = '#';
567             break;
568         case '(':
569             c = '[';
570             break;
571         case '/':
572             c = '\\';
573             break;
574         case ')':
575             c = ']';
576             break;
577         case '\'':
578             c = '^';
579             break;
580         case '<':
581             c = '{';
582             break;
583         case '!':
584             c = '|';
585             break;
586         case '>':
587             c = '}';
588             break;
589         case '-':
590             c = '~';
591             break;
592     }
593     if (c)
594     {
595         *s->inp = c;
596         memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2);
597         s->inl -= 2;
598     }
599     return c;
600 }
601 
602 int
foldline(Source * s)603     foldline(Source * s)
604 {
605     int n = 1;
606 
607     /* skip pending wihite spaces */
608     while ((s->inp[n] == ' ') || (s->inp[n] == '\t'))
609     {
610         n++;
611         if ((s->inp + n >= s->inl) && (fillbuf(s) == EOF))
612             break;
613     }
614 
615     /* refill buffer */
616     while (s->inp + (n + 1) >= s->inl && fillbuf(s) != EOF);
617 
618     /* skip DOS line ends */
619     if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) ||
620         ((s->inp[n] == '\n') && (s->inp[n+1] == '\r')))
621         n++;
622 
623     if ((s->inp[n] == '\n') || (s->inp[n] == '\r'))
624     {
625         memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2);
626         s->inl -= n + 1;
627         return 1;
628     }
629     return 0;
630 }
631 
632 int
fillbuf(Source * s)633     fillbuf(Source * s)
634 {
635     int n;
636 
637     if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0)
638         n = 0;
639     s->inl += n;
640     s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB;
641     if (n == 0)
642     {
643         s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC;
644         return EOF;
645     }
646     return 0;
647 }
648 
649 /*
650  * Push down to new source of characters.
651  * If fd>0 and str==NULL, then from a file `name';
652  * if fd==-1 and str, then from the string.
653  */
654 Source *
setsource(char * name,int path,int fd,char * str,int wrap)655     setsource(char *name, int path, int fd, char *str, int wrap)
656 {
657     Source *s = new(Source);
658     int len;
659 
660     s->line = 1;
661     s->lineinc = 0;
662     s->fd = fd;
663     s->filename = name;
664     s->next = cursource;
665     s->ifdepth = 0;
666     s->pathdepth = path;
667     s->wrap = wrap;
668 
669     cursource = s;
670 
671     if (s->wrap)
672         genwrap(0);
673 
674     /* slop at right for EOB */
675     if (str)
676     {
677         len = strlen(str);
678         s->inb = domalloc(len + 4);
679         s->inp = s->inb;
680         strncpy((char *) s->inp, str, len);
681     }
682     else
683     {
684         s->inb = domalloc(INS + 4);
685         s->inp = s->inb;
686         len = 0;
687     }
688     s->inl = s->inp + len;
689     s->inl[0] = s->inl[1] = EOB;
690 
691     return s;
692 }
693 
694 void
unsetsource(void)695     unsetsource(void)
696 {
697     Source *s = cursource;
698 
699     if (s->wrap)
700         genwrap(1);
701 
702     if (s->fd >= 0)
703     {
704         close(s->fd);
705         dofree(s->inb);
706     }
707     cursource = s->next;
708     dofree(s);
709 }
710