/* Source: m6w6/ext-psi, src/parser_scan.re @ 437b33fde3ced9f56111f33defba0f459ec51cd8 */
/*******************************************************************************
 Copyright (c) 2016, Michael Wallner <mike@php.net>.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
25
26 #ifdef HAVE_CONFIG_H
27 # include "config.h"
28 #else
29 # include "php_config.h"
30 #endif
31
32 #include "parser.h"
33 #include "plist.h"
34 #include "debug.h"
35
/*
 * re2c replaces the directive below with a definition of YYMAXFILL. The
 * fallback value is only used when no such definition has been emitted.
 */
/*!max:re2c*/
#ifndef YYMAXFILL
# define YYMAXFILL 256
#endif

/*
 * Return the scanner's YYMAXFILL value.
 *
 * psi_parser_scan() places its scan limit YYMAXFILL bytes past the end of the
 * input, so this is presumably the amount of padding an input buffer has to
 * provide -- confirm against the code that fills struct psi_parser_input.
 */
size_t psi_parser_maxfill(void) {
	return YYMAXFILL;
}
44
/*
 * Record a line break: remember the cursor position following the newline in
 * `eol` (used as the base for column computation) and bump the line counter.
 *
 * Wrapped in do/while(0) so the two statements behave as a single statement
 * even under an unbraced if/else.
 */
#define NEWLINE() do { \
	eol = cur; \
	++lines; \
} while (0)
48
/*
 * Create a token of type `t` for the current match, append it to `tokens`
 * and dump it to the debug output.
 *
 * Comment and whitespace tokens are created with empty text; every other
 * token carries the bytes between `tok` and `cur`. The position arguments
 * are the 1-based column (`tok - eol + 1`) and the current value of `lines`.
 *
 * Relies on the locals of psi_parser_scan(): P, I, tok, cur, eol, lines,
 * token and tokens. The argument is parenthesized so that any expression can
 * be passed safely; note that it is still evaluated more than once.
 */
#define NEWTOKEN(t) do { \
	if ((t) == PSI_T_COMMENT || (t) == PSI_T_WHITESPACE) { \
		token = psi_token_init((t), "", 0, tok - eol + 1, lines, I->file); \
	} else { \
		token = psi_token_init((t), tok, cur - tok, tok - eol + 1, lines, I->file); \
	} \
	tokens = psi_plist_add(tokens, &token); \
	PSI_DEBUG_LOCK(P, \
			PSI_DEBUG_PRINT(P, "PSI: scanned < "); \
			PSI_DEBUG_DUMP(P, psi_token_dump, token); \
	); \
} while(0)
61
/*
 * Leave the scanner through the `done` label once the cursor has reached the
 * padding area, i.e. the last YYMAXFILL bytes before `lim`.
 *
 * Wrapped in do/while(0) so that a following `else` cannot attach to the
 * macro's internal `if`.
 */
#define CHECKEOF() do { \
	if (cur >= lim - YYMAXFILL) { \
		goto done; \
	} \
} while (0)
63
64 struct psi_plist *psi_parser_scan(struct psi_parser *P, struct psi_parser_input *I)
65 {
66 struct psi_plist *tokens;
67 struct psi_token *token;
68 const char *tok, *cur, *lim, *mrk, *eol, *ctxmrk;
69 unsigned parens, lines = 1;
70 bool escaped;
71 token_t char_width;
72
73 PSI_DEBUG_PRINT(P, "PSI: scanning %s\n", I->file->val);
74
75 tok = mrk = eol = cur = I->buffer;
76 lim = I->buffer + I->length + YYMAXFILL;
77 tokens = psi_plist_init((psi_plist_dtor) psi_token_free);
78
79 start: ;
80 char_width = 1;
81 ctxmrk = NULL;
82 tok = cur;
83
84 (void) ctxmrk;
85
86 /*!re2c
87
88 re2c:indent:top = 2;
89 re2c:define:YYCTYPE = "unsigned char";
90 re2c:define:YYCURSOR = cur;
91 re2c:define:YYLIMIT = lim;
92 re2c:define:YYMARKER = mrk;
93 re2c:define:YYCTXMARKER = ctxmrk;
94 re2c:define:YYFILL = "CHECKEOF();";
95 re2c:yyfill:parameter = 0;
96
97 W = [a-zA-Z0-9_\x80-\xff];
98 SP = [ \t\f];
99 EOL = [\r\n];
100 NAME = [a-zA-Z_\x80-\xff] W*;
101 NSNAME = (NAME)? ("\\" NAME)+;
102 DOLLAR_NAME = '$' W+;
103 CPP_HEADER = "<" [-._/a-zA-Z0-9]+ ">";
104 CPP_ATTRIBUTE = "__attribute__" SP* "((";
105
106 DEC_CONST = [1-9] [0-9]*;
107 OCT_CONST = "0" [0-7]*;
108 HEX_CONST = '0x' [0-9a-fA-F]+;
109 INT_CONST = (DEC_CONST | OCT_CONST | HEX_CONST);
110
111 FLT_HEX_CONST = HEX_CONST ("." [0-9a-fA-F]*)? 'p' [+-]? [0-9]+;
112 FLT_DEC_NUM = "0" | DEC_CONST;
113 FLT_DEC_CONST = (FLT_DEC_NUM ("." [0-9]*)? 'e' [+-]? [0-9]+) | (FLT_DEC_NUM "." [0-9]*) | ("." [0-9]+);
114 FLT_CONST = (FLT_DEC_CONST | FLT_HEX_CONST);
115
116 INT_CONST { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_INT; goto start; }
117 INT_CONST / 'u' { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_INT | PSI_NUMBER_U; cur += 1; goto start; }
118 INT_CONST / 'l' { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_INT | PSI_NUMBER_L; cur += 1; goto start; }
119 INT_CONST / ('lu' | 'ul') { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_INT | PSI_NUMBER_UL; cur += 2; goto start; }
120 INT_CONST / ('llu' | 'ull') { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_INT | PSI_NUMBER_ULL; cur += 3; goto start; }
121
122 FLT_CONST { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_FLT; goto start; }
123 FLT_CONST / 'f' { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_FLT | PSI_NUMBER_F; cur += 1; goto start; }
124 FLT_CONST / 'l' { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_FLT | PSI_NUMBER_L; cur += 1; goto start; }
125 FLT_CONST / 'df' { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_FLT | PSI_NUMBER_DF; cur += 2; goto start; }
126 FLT_CONST / 'dd' { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_FLT | PSI_NUMBER_DD; cur += 2; goto start; }
127 FLT_CONST / 'dl' { NEWTOKEN(PSI_T_NUMBER); token->flags = PSI_NUMBER_FLT | PSI_NUMBER_DL; cur += 2; goto start; }
128
129 "'" { escaped = false; tok += 1; goto character; }
130 "\"" { escaped = false; tok += 1; goto string; }
131 "u8" / "\"" { char_width = 1; }
132 "u" / ['"] { char_width = 2; }
133 "U" / ['"] { char_width = 4; }
134 "L" / ['"] { char_width = sizeof(wchar_t); }
135
136 "/*" { goto comment; }
137 "//" { goto comment_sl; }
138
139 "##" { NEWTOKEN(PSI_T_CPP_PASTE); goto start; }
140 "#" { NEWTOKEN(PSI_T_HASH); goto start; }
141 "(" { NEWTOKEN(PSI_T_LPAREN); goto start; }
142 ")" { NEWTOKEN(PSI_T_RPAREN); goto start; }
143 ";" { NEWTOKEN(PSI_T_EOS); goto start; }
144 "," { NEWTOKEN(PSI_T_COMMA); goto start; }
145 ":" { NEWTOKEN(PSI_T_COLON); goto start; }
146 "{" { NEWTOKEN(PSI_T_LBRACE); goto start; }
147 "}" { NEWTOKEN(PSI_T_RBRACE); goto start; }
148 "[" { NEWTOKEN(PSI_T_LBRACKET); goto start; }
149 "]" { NEWTOKEN(PSI_T_RBRACKET); goto start; }
150 "!=" { NEWTOKEN(PSI_T_CMP_NE); goto start; }
151 "==" { NEWTOKEN(PSI_T_CMP_EQ); goto start; }
152 "&&" { NEWTOKEN(PSI_T_AND); goto start; }
153 "||" { NEWTOKEN(PSI_T_OR); goto start; }
154 "=" { NEWTOKEN(PSI_T_EQUALS); goto start; }
155 "*" { NEWTOKEN(PSI_T_ASTERISK); goto start; }
156 "~" { NEWTOKEN(PSI_T_TILDE); goto start; }
157 "!" { NEWTOKEN(PSI_T_NOT); goto start; }
158 "%" { NEWTOKEN(PSI_T_MODULO); goto start; }
159 "&" { NEWTOKEN(PSI_T_AMPERSAND); goto start; }
160 "+" { NEWTOKEN(PSI_T_PLUS); goto start; }
161 "-" { NEWTOKEN(PSI_T_MINUS); goto start; }
162 "/" { NEWTOKEN(PSI_T_SLASH); goto start; }
163 "\\" { NEWTOKEN(PSI_T_BSLASH); goto start; }
164 "|" { NEWTOKEN(PSI_T_PIPE); goto start; }
165 "^" { NEWTOKEN(PSI_T_CARET); goto start; }
166 "<<" { NEWTOKEN(PSI_T_LSHIFT); goto start; }
167 ">>" { NEWTOKEN(PSI_T_RSHIFT); goto start; }
168 "<=" { NEWTOKEN(PSI_T_CMP_LE); goto start; }
169 ">=" { NEWTOKEN(PSI_T_CMP_GE); goto start; }
170 "<" { NEWTOKEN(PSI_T_LCHEVR); goto start; }
171 ">" { NEWTOKEN(PSI_T_RCHEVR); goto start; }
172 "." { NEWTOKEN(PSI_T_PERIOD); goto start; }
173 "..." { NEWTOKEN(PSI_T_ELLIPSIS); goto start; }
174 "?" { NEWTOKEN(PSI_T_IIF); goto start; }
175 "pragma" { NEWTOKEN(PSI_T_PRAGMA); goto start; }
176 "pragma" W+ "once" { NEWTOKEN(PSI_T_PRAGMA_ONCE); goto start; }
177 "__"? "inline" { NEWTOKEN(PSI_T_CPP_INLINE); goto start; }
178 "__restrict" { NEWTOKEN(PSI_T_CPP_RESTRICT); goto start; }
179 "__extension__" { NEWTOKEN(PSI_T_CPP_EXTENSION); goto start; }
180 "__asm" ("__")? { NEWTOKEN(PSI_T_CPP_ASM); goto start; }
181 "volatile" { NEWTOKEN(PSI_T_VOLATILE); goto start; }
182 "sizeof" { NEWTOKEN(PSI_T_SIZEOF); goto start; }
183 "line" { NEWTOKEN(PSI_T_LINE); goto start; }
184 "typedef" { NEWTOKEN(PSI_T_TYPEDEF); goto start; }
185 "struct" { NEWTOKEN(PSI_T_STRUCT); goto start; }
186 "union" { NEWTOKEN(PSI_T_UNION); goto start; }
187 "enum" { NEWTOKEN(PSI_T_ENUM); goto start; }
188 "const" { NEWTOKEN(PSI_T_CONST); goto start; }
189 "void" { NEWTOKEN(PSI_T_VOID); goto start; }
190 "bool" { NEWTOKEN(PSI_T_BOOL); goto start; }
191 "char" { NEWTOKEN(PSI_T_CHAR); goto start; }
192 "short" { NEWTOKEN(PSI_T_SHORT); goto start; }
193 "int" { NEWTOKEN(PSI_T_INT); goto start; }
194 "long" { NEWTOKEN(PSI_T_LONG); goto start; }
195 "float" { NEWTOKEN(PSI_T_FLOAT); goto start; }
196 "double" { NEWTOKEN(PSI_T_DOUBLE); goto start; }
197 "unsigned" { NEWTOKEN(PSI_T_UNSIGNED); goto start; }
198 "signed" { NEWTOKEN(PSI_T_SIGNED); goto start; }
199 'IF' { NEWTOKEN(PSI_T_IF); goto start; }
200 'IFDEF' { NEWTOKEN(PSI_T_IFDEF); goto start; }
201 'IFNDEF' { NEWTOKEN(PSI_T_IFNDEF); goto start; }
202 'ELSE' { NEWTOKEN(PSI_T_ELSE); goto start; }
203 'ELIF' { NEWTOKEN(PSI_T_ELIF); goto start; }
204 'ENDIF' { NEWTOKEN(PSI_T_ENDIF); goto start; }
205 'DEFINE' { NEWTOKEN(PSI_T_DEFINE); goto start; }
206 'DEFINED' { NEWTOKEN(PSI_T_DEFINED); goto start; }
207 'UNDEF' { NEWTOKEN(PSI_T_UNDEF); goto start; }
208 'WARNING' { NEWTOKEN(PSI_T_WARNING); goto start; }
209 'ERROR' { NEWTOKEN(PSI_T_ERROR); goto start; }
210 'INCLUDE' { NEWTOKEN(PSI_T_INCLUDE); goto start; }
211 'INCLUDE_NEXT' { NEWTOKEN(PSI_T_INCLUDE_NEXT); goto start; }
212 'TRUE' { NEWTOKEN(PSI_T_TRUE); goto start; }
213 'FALSE' { NEWTOKEN(PSI_T_FALSE); goto start; }
214 'NULL' { NEWTOKEN(PSI_T_NULL); goto start; }
215 'MIXED' { NEWTOKEN(PSI_T_MIXED); goto start; }
216 'CALLABLE' { NEWTOKEN(PSI_T_CALLABLE); goto start; }
217 'STRING' { NEWTOKEN(PSI_T_STRING); goto start; }
218 'ARRAY' { NEWTOKEN(PSI_T_ARRAY); goto start; }
219 'OBJECT' { NEWTOKEN(PSI_T_OBJECT); goto start; }
220 'CALLBACK' { NEWTOKEN(PSI_T_CALLBACK); goto start; }
221 'STATIC' { NEWTOKEN(PSI_T_STATIC); goto start; }
222 'FUNCTION' { NEWTOKEN(PSI_T_FUNCTION); goto start; }
223 'LIB' { NEWTOKEN(PSI_T_LIB); goto start; }
224 'LET' { NEWTOKEN(PSI_T_LET); goto start; }
225 'SET' { NEWTOKEN(PSI_T_SET); goto start; }
226 'PRE_ASSERT' { NEWTOKEN(PSI_T_PRE_ASSERT); goto start; }
227 'POST_ASSERT' { NEWTOKEN(PSI_T_POST_ASSERT); goto start; }
228 'RETURN' { NEWTOKEN(PSI_T_RETURN); goto start; }
229 'AS' { NEWTOKEN(PSI_T_AS); goto start; }
230 'FREE' { NEWTOKEN(PSI_T_FREE); goto start; }
231 'TEMP' { NEWTOKEN(PSI_T_TEMP); goto start; }
232 'STRLEN' { NEWTOKEN(PSI_T_STRLEN); goto start; }
233 'STRVAL' { NEWTOKEN(PSI_T_STRVAL); goto start; }
234 'PATHVAL' { NEWTOKEN(PSI_T_PATHVAL); goto start; }
235 'INTVAL' { NEWTOKEN(PSI_T_INTVAL); goto start; }
236 'FLOATVAL' { NEWTOKEN(PSI_T_FLOATVAL); goto start; }
237 'BOOLVAL' { NEWTOKEN(PSI_T_BOOLVAL); goto start; }
238 'ARRVAL' { NEWTOKEN(PSI_T_ARRVAL); goto start; }
239 'OBJVAL' { NEWTOKEN(PSI_T_OBJVAL); goto start; }
240 'ZVAL' { NEWTOKEN(PSI_T_ZVAL); goto start; }
241 'COUNT' { NEWTOKEN(PSI_T_COUNT); goto start; }
242 'CALLOC' { NEWTOKEN(PSI_T_CALLOC); goto start; }
243 'TO_OBJECT' { NEWTOKEN(PSI_T_TO_OBJECT); goto start; }
244 'TO_ARRAY' { NEWTOKEN(PSI_T_TO_ARRAY); goto start; }
245 'TO_STRING' { NEWTOKEN(PSI_T_TO_STRING); goto start; }
246 'TO_INT' { NEWTOKEN(PSI_T_TO_INT); goto start; }
247 'TO_FLOAT' { NEWTOKEN(PSI_T_TO_FLOAT); goto start; }
248 'TO_BOOL' { NEWTOKEN(PSI_T_TO_BOOL); goto start; }
249 NAME { NEWTOKEN(PSI_T_NAME); goto start; }
250 NSNAME { NEWTOKEN(PSI_T_NSNAME); goto start; }
251 DOLLAR_NAME { NEWTOKEN(PSI_T_DOLLAR_NAME); goto start; }
252 CPP_HEADER { tok += 1; cur -= 1; NEWTOKEN(PSI_T_CPP_HEADER); cur += 1; goto start; }
253 CPP_ATTRIBUTE { parens = 2; goto cpp_attribute; }
254 EOL { NEWTOKEN(PSI_T_EOL); NEWLINE(); goto start; }
255 SP+ { NEWTOKEN(PSI_T_WHITESPACE); goto start; }
256 [^] { CHECKEOF(); NEWTOKEN(-2); goto error; }
257 * { CHECKEOF(); NEWTOKEN(-1); goto error; }
258
259 */
260
261 character: ;
262 /*!re2c
263
264 EOL { NEWLINE(); goto character; }
265 "\\" { escaped = !escaped; goto character; }
266 "'" {
267 if (escaped) {
268 escaped = false;
269 goto character;
270 }
271 cur -= 1;
272 NEWTOKEN(PSI_T_QUOTED_CHAR);
273 cur += 1;
274 token->flags = char_width;
275 goto start;
276 }
277 * { escaped = false; goto character; }
278
279 */
280
281 string: ;
282 /*!re2c
283
284 EOL { NEWLINE(); goto string; }
285 "\\" { escaped = !escaped; goto string; }
286 "\"" {
287 if (escaped) {
288 escaped = false;
289 goto string;
290 }
291 cur -= 1;
292 NEWTOKEN(PSI_T_QUOTED_STRING);
293 cur += 1;
294 token->flags = char_width;
295 goto start;
296 }
297 * { escaped = false; goto string; }
298
299 */
300
301 comment: ;
302 /*!re2c
303
304 EOL { NEWLINE(); goto comment; }
305 "*" "/" { NEWTOKEN(PSI_T_COMMENT); goto start; }
306 * { goto comment; }
307
308 */
309
310 comment_sl: ;
311 /*!re2c
312
313 EOL { NEWTOKEN(PSI_T_COMMENT); tok = cur - 1; NEWTOKEN(PSI_T_EOL); NEWLINE(); goto start; }
314 * { goto comment_sl; }
315
316 */
317
318 cpp_attribute: ;
319
320 /*!re2c
321
322 "(" { ++parens; goto cpp_attribute; }
323 ")" { if (parens == 1) { NEWTOKEN(PSI_T_CPP_ATTRIBUTE); goto start; } else { --parens; goto cpp_attribute; } }
324 EOL { NEWLINE(); goto cpp_attribute; }
325 * { goto cpp_attribute; }
326
327 */
328 error: ;
329
330 P->error(PSI_DATA(P), token, PSI_WARNING, "PSI syntax error: unexpected input (%d) '%.*s' at col %tu",
331 token->type, token->text->len, token->text->val, tok - eol + 1);
332 psi_plist_free(tokens);
333 return NULL;
334
335 done: ;
336
337 PSI_DEBUG_PRINT(P, "PSI: EOF cur=%p lim=%p\n", cur, lim);
338
339 return tokens;
340 }