Author: Lou Adler
Does anybody know the name of the routine used in the IDE that enables control
characters embedded into a string to be recognised. For example the characters
'Line 1'#13#10'Line 2' are recognised by the compiler as a single string literal.
I'd like to build a similar facility into an application. Is a single routine used
for this or is it embedded somewhere in the parser of the compiler?
Answer:
I'm pretty sure that the compiler uses an internal routine for this which is not
accessible to us mere mortals. Perhaps you can extract something useful from the
unit below. I wrote it to extract string property values from DFM files. It is a
work in progress, so if you want to use it for the same purpose be aware that you
may have DFMs it will not be able to digest without modifications.
1 unit DFMParser;
2
3 interface
4
5 uses
6 classes, sysutils;
7
8 type
9 TBaseParser = class
10 private
11 FText: string;
12 FCurrent, FAnchor: Integer;
13 FToken: string;
14 protected
15 procedure Error(const S: string); overload;
16 procedure Error(const fmt: string; const A: array of const); overload;
17 procedure DropAnchor;
18 procedure NextToken;
19 procedure NextChar;
20 procedure SkipWhitespace;
21 procedure SkipToEol;
22 procedure SkipTo(ch: Char);
23 procedure SkipToString(const S: string);
24 function EndOfText: Boolean;
25 function IsTokenChar: Boolean;
26 function IsWhiteSpace: Boolean;
27 function CurrentChar: Char;
28 function LastWord: string;
29 function ParseEncodedChar: Char;
30 function ParseQuotedString: string;
31 function ParseStringValue: string;
32 public
33 procedure Parse; virtual; abstract;
34 constructor Create(const S: string); virtual;
35 property Token: string read FToken;
36 end;
37
38 TParsePropertyEvent = procedure(const aComponentName, aPropertyName,
39 aPropertyValue: string) of object;
40
41 TDFMParser = class(TBaseParser)
42 private
43 FParsePropertyEvent: TParsePropertyEvent;
44 protected
45 procedure ParseComponent;
46 procedure ParseProperty(const componentName: string);
47 procedure ParsePropertyString(const componentName, propertyName: string);
48 function TokenIsObject: Boolean;
49 function IsEndToken: Boolean;
50 procedure DoPropertyEvent(const componentName, propertyname, propvalue: string);
51 public
52 procedure Parse; override;
53 property OnParseProperty: TParsePropertyEvent read FParsePropertyEvent
54 write FParsePropertyEvent;
55 end;
56
57 EDFMParserError = class(Exception);
58
59 TTranslationItemEvent = procedure(const name, value: string) of object;
60
61 TTranslationParser = class(TBaseParser)
62 private
63 FTranslationItemEvent: TTranslationItemEvent;
64 procedure ParseStringConstant;
65 procedure SkipWhitespaceAndComments;
66 procedure DoTranslationItem(const name, value: string);
67 public
68 constructor Create(const S: string); override;
69 procedure Parse; override;
70 property OnTranslationItem: TTranslationItemEvent read FTranslationItemEvent
71 write FTranslationItemEvent;
72 end;
73
74 implementation
75
76 uses
77 charsets;
78
79 const
80 quote = '''';
81
82 constructor TBaseParser.Create(const S: string);
83 begin
84 FText := S;
85 FCurrent := 1;
86 end;
87
88 function TBaseParser.CurrentChar: Char;
89 begin
90 result := FText[FCurrent];
91 end;
92
93 procedure TBaseParser.DropAnchor;
94 begin
95 FAnchor := FCurrent;
96 end;
97
98 function TBaseParser.EndOfText: Boolean;
99 begin
100 result := FCurrent > Length(FText);
101 end;
102
103 procedure TBaseParser.Error(const S: string);
104 begin
105 raise EPArserError.Create(S);
106 end;
107
108 procedure TBaseParser.Error(const fmt: string; const A: array of const);
109 begin
110 Error(Format(fmt, A));
111 end;
112
113 function TBaseParser.IsTokenChar: Boolean;
114 begin
115 result := (Currentchar in Charsets.IdentifierChars) or (CurrentChar = '.');
116 end;
117
118 function TBaseParser.IsWhiteSpace: Boolean;
119 begin
120 result := Currentchar in [#1..#32];
121 end;
122
123 function TBaseParser.LastWord: string;
124 begin
125 Assert(FAnchor <= FCurrent);
126 result := Copy(FText, FAnchor, FCurrent - FAnchor);
127 end;
128
129 procedure TBaseParser.NextChar;
130 begin
131 Inc(FCurrent);
132 if EndOfText then
133 Error('Unexpected end of text');
134 end;
135
136 procedure TBaseParser.NextToken;
137 begin
138 SkipWhitespace;
139 DropAnchor;
140 while not EndOfText and IsTokenChar do
141 Inc(FCurrent);
142 FToken := LastWord;
143 end;
144
145 procedure TBaseParser.SkipTo(ch: Char);
146 begin
147 while not EndOfText and (Currentchar <> ch) do
148 NextChar;
149 Inc(FCurrent);
150 end;
151
152 procedure TBaseParser.SkipToString(const S: string);
153 var
154 P: PChar;
155 begin
156 p := StrPos(@FText[FCurrent], Pchar(S));
157 if Assigned(p) then
158 FCurrent := p - PChar(FText) + 1 + Length(S)
159 else
160 Error('Expected string "%s" not found', [s]);
161 end;
162
163 procedure TBaseParser.SkipToEol;
164 begin
165 while not EndOfText and (FText[FCurrent] <> #10) do
166 Inc(FCurrent);
167 end;
168
169 procedure TBaseParser.SkipWhitespace;
170 begin
171 while not EndOfText and IsWhiteSpace do
172 Inc(FCurrent);
173 end;
174
175 function TBaseParser.ParseQuotedString: string;
176 begin
177 Assert(CurrentChar = quote);
178 Result := '';
179 repeat
180 NextChar; {skip leading quote}
181 DropAnchor;
182 while CurrentChar <> quote do
183 NextChar;
184 Result := Result + LastWord;
185 NextChar;
186 if CurrentChar = quote then
187 Result := Result + quote; {literal quote}
188 until
189 CurrentChar <> quote;
190 SkipWhitespace;
191 end;
192
193 function TBaseParser.ParseEncodedChar: Char;
194 var
195 allowed: Charsets.TCharset;
196 n: Integer;
197 begin
198 Assert(CurrentChar = '#');
199 NextChar;
200 DropAnchor;
201 if CurrentChar = '$' then
202 begin
203 allowed := CHarsets.HexNumerals;
204 NextChar;
205 end
206 else
207 allowed := Charsets.IntegerChars;
208 while CurrentChar in allowed do
209 NextChar;
210 n := StrToInt(LastWord);
211 if n > High(Byte) then
212 Error('Encountered UNICODE character in string, cannot handle that.');
213 Result := Char(n);
214 end;
215
216 function TBaseParser.ParseStringValue: string;
217 begin
218 Result := '';
219 while True do
220 case CurrentChar of
221 quote:
222 Result := Result + ParseQuotedString;
223 '#':
224 Result := Result + ParseEncodedChar;
225 '+':
226 begin
227 NextChar;
228 SkipWhitespace;
229 end;
230 else
231 Break;
232 end;
233 end;
234
235 { TDFMParser }
236
237 procedure TDFMParser.DoPropertyEvent(const componentName, propertyname, propvalue:
238 string);
239 begin
240 if Assigned(FParsePropertyEvent) then
241 FParsePropertyEvent(componentName, propertyname, propvalue);
242 end;
243
244 function TDFMParser.IsEndToken: Boolean;
245 begin
246 result := Token = 'end';
247 end;
248
249 procedure TDFMParser.Parse;
250 begin
251 while not EndOfText do
252 begin
253 ParseComponent;
254 SkipWhitespace;
255 end;
256 end;
257
258 procedure TDFMParser.ParseComponent;
259 var
260 componentName: string;
261 begin
262 if FToken = '' then
263 NextToken;
264 if not TokenIsObject then
265 Error('Expected: inherited or object, found : %s', [Token]);
266 NextToken;
267 componentName := Token;
268 SkipToEol;
269 repeat
270 NextToken;
271 if TokenIsObject then
272 ParseComponent
273 else if not IsEndToken then
274 ParseProperty(componentName);
275 until
276 IsEndToken or EndOfText;
277 if IsEndToken then
278 FToken := '';
279 end;
280
281 procedure TDFMParser.ParseProperty(const componentName: string);
282 var
283 propname: string;
284 begin
285 propname := Token;
286 SkipWhitespace;
287 if CurrentChar <> '=' then
288 Error('Expected: =, found %s', [Currentchar]);
289 NextChar;
290 SkipWhitespace;
291 case CurrentChar of
292 '{':
293 SkipTo('}');
294 '(':
295 SkipTo(')');
296 '[':
297 SkipTo(']');
298 quote, '#':
299 ParsePropertyString(componentName, propname);
300 else
301 SkipToEol
302 end;
303 end;
304
305 procedure TDFMParser.ParsePropertyString(const componentName, propertyName: string);
306 var
307 propvalue: string;
308 begin
309 propvalue := ParseStringValue;
310 if propvalue <> '' then
311 DoPropertyEvent(componentName, propertyname, propvalue);
312 end;
313
314 function TDFMParser.TokenIsObject: Boolean;
315 begin
316 Result := (Token = 'inherited') or (Token = 'object')
317 end;
318
319 { TTranslationParser }
320
321 constructor TTranslationParser.Create(const S: string);
322 const
323 resStr = 'resourcestring';
324 var
325 lS: string;
326 resourceStringPos: Integer;
327 n1, n2: Integer;
328 begin
329 {Isolate the resourcestring section. We expect only one}
330 lS := LowerCase(S);
331 resourceStringPos := Pos(resStr, lS);
332 if resourceStringPos = 0 then
333 inherited Create('')
334 else
335 begin
336 {look for an $ifdef german}
337 n1 := Pos('{$ifdef german', lS);
338 if n1 > 0 then
339 begin
340 {look for the following $else}
341 Delete(lS, 1, n1 - 1);
342 n2 := Pos('{$else}', lS);
343 if n2 = 0 then
344 Error('Malformed $IFDEF...$ELSE encountered, $ELSE not found');
345 Delete(lS, 1, n2 - 1);
346 Inc(n1, n2 - 1);
347 {look for the $ENDIF}
348 n2 := Pos('{$endif}', lS);
349 if n2 = 0 then
350 Error('Malformed $IFDEF...$ENDIF encountered, $ENDIF not found');
351 inherited Create(Copy(S, n1, n2 - 1));
352 end
353 else
354 begin
355 {look for an $ifndef german}
356 n1 := Pos('{$ifndef german', lS);
357 if n1 = 0 then
358 inherited Create('')
359 else
360 begin
361 {in the $ifndef german construct the resourcestring keyword often comes
362 after the $ifndef.}
363 if n1 < resourceStringPos then
364 n1 := resourceStringPos + Length(resstr);
365 Delete(lS, 1, n1 - 1);
366 {look for the $ENDIF}
367 n2 := Pos('{$endif}', lS);
368 if n2 = 0 then
369 Error('Malformed $IFDEF...$ENDIF encountered, $ENDIF not found');
370 inherited Create(Copy(S, n1, n2 - 1));
371 end;
372 end;
373 end;
374 end;
375
376 procedure TTranslationParser.DoTranslationItem(const name, value: string);
377 begin
378 if Assigned(FTranslationItemEvent) then
379 FTranslationItemEvent(name, value);
380 end;
381
382 procedure TTranslationParser.Parse;
383 begin
384 while not EndOfText do
385 begin
386 ParseStringConstant;
387 SkipWhitespace;
388 end;
389 end;
390
391 procedure TTranslationParser.ParseStringConstant;
392 var
393 name, value: string;
394 begin
395 SkipWhitespaceAndComments;
396 if EndOfText then
397 Exit;
398 NextToken;
399 name := Token;
400 SkipWhitespaceAndComments;
401 if EndOfText then
402 Exit;
403 if CurrentChar <> '=' then
404 Error('Expected: =, found "%s"', [CurrentChar]);
405 NextChar;
406 SkipWhitespaceAndComments;
407 if EndOfText then
408 Exit;
409 value := ParseStringValue;
410 SkipWhiteSpace;
411 if not EndOfText and (CurrentChar = ';') then
412 NextChar;
413 DoTranslationItem(name, value);
414 end;
415
416 procedure TTranslationParser.SkipWhitespaceAndComments;
417 begin
418 while True do
419 begin
420 SkipWhitespace;
421 if not EndOfText then
422 begin
423 case CurrentChar of
424 '/':
425 SkipToEol; { single line comment }
426 '{':
427 SkipTo('}'); { comment }
428 '(':
429 begin
430 NextChar;
431 if CurrentChar = '*' then
432 SkipToString('*)')
433 else
434 Error('Expected: comment or indentifier, found: "(%s"',
435 [CurrentChar]);
436 end;
437 else
438 Break
439 end;
440 end
441 else
442 Break;
443 end;
444 end;
445
446 end.
447
448 unit Charsets;
449
450 interface
451
452 type
453 TCharSet = set of AnsiChar;
454 const
455 Signs: TCharset = ['-', '+'];
456 Numerals: TCharset = ['0'..'9'];
457 HexNumerals: TCharset = ['A'..'F', 'a'..'f', '0'..'9'];
458 IntegerChars: TCharset = ['0'..'9', '-', '+'];
459 IdentifierChars: TCharset = ['a'..'z', 'A'..'Z', '0'..'9', '_'];
460 var
461 Digits, Letters, LowerCaseLetters, UpperCaseLetters: TCharSet;
462 FloatChars, SciFloatChars: TCharset;
463 AlphaNum, NonAlphaNum: TCharset;
464
465 { Need to call this again when locale changes. }
466 procedure SetupCharsets;
467
468 implementation
469
470 uses
471 Windows, Sysutils;
472
473 var
474 locale: DWORD = 0;
475
476 procedure SetupCharsets;
477 var
478 ch: AnsiChar;
479 begin
480 if locale = GetThreadLocale then
481 Exit
482 else
483 Locale := GetThreadLocale;
484 LowerCaseLetters := [];
485 UpperCaseLetters := [];
486 AlphaNum := [];
487 NonAlphaNum := [];
488 Digits := Numerals;
489 for ch := Low(ch) to High(ch) do
490 begin
491 if IsCharAlpha(ch) then
492 if IsCharUpper(ch) then
493 Include(UpperCaseLetters, ch)
494 else
495 Include(LowerCaseLetters, ch);
496 if IsCharAlphanumeric(ch) then
497 Include(AlphaNum, ch)
498 else
499 Include(NonAlphaNum, ch);
500 end;
501 Letters := LowerCaseLetters + UpperCaseLetters;
502 FloatChars := IntegerChars;
503 Include(FloatChars, DecimalSeparator);
504 SciFloatChars := FloatChars + ['e', 'E'];
505 end;
506
507 initialization
508 SetupCharsets;
509 end.
|