Author: Tomas Rutkauskas
I have a list of items that I will need to deal with (looking for duplicate files
on a drive) that can be upto 2 to 5 million in size. So I will be populating this
list and then searching it over and over and over. Normally when dealing with
smaller lists like this I would simply use a TStringList and attach an object.
However, this seems a little large for TStringList and the main reason, the
searching with IndexOf, I don't think is reason enough to use it. So what I am
looking for is a list of some sort (TObjectList ?) that is fast and good to deal
with and can easily handle this size of entries. It would be really nice it there
was a way to create multiple indexes into the in-memory list, as that would greatly
speed up my processing of the information. The best solution would be an in-memory
database of some sort (at least I think it would be), but my issue with in-memory
databases is that of the String sizes for File Name and directory. If I use a
regular String variable in an object, then the size can be variable. If I use a
standard DB field, then the size in the ones I have seen are all static. So I have
to define a huge field to handle all file names which wastes space on all other
entries in the list. Any thoughts as the best container to handle this sort of
thing?
Answer:
1
2 {$A+,B-,D-,E+,F-,G+,I-,L-,N-,O-,P-,Q-,R-,S-,T-,V-,X-,Y-}
3
4 program FindDup;
5
6 {$APPTYPE CONSOLE} // 2002-12-16 - Delphi 32 console app
7
8 {Usage: FINDDUP [D:]
9
10 This program finds all duplicate files on the specified drive. The algorithm uses a
11 hash table where each hash value contains a linked list of files that match a given
12 hash. Every file found is inserted into the hash table linked list unless it's
13 already there and therefore a duplicate. All duplicates are added to a red-black
14 tree, where each node contains the file ame and a pointer to a linked list of
15 records containing size, date, and path information.
16
17 This program can be compiled in either real or protected modes. Protected mode will
18 allow it to search larger drives. Informal benchmarks on a 1.6 Gig drive with > 900
19 directories, Pentium 90, compiled for p-mode:
20
21 TEST 16 sec. (simple do-nothing recursive dir searcher)
22 FINDDUP 18 sec. - 16 = 2 sec.
23 REP-DPMI 36 sec. - 16 = 20 sec.
24
25 Subtracting the overhead of FindFirst/ FindNext, this program runs in about 2
26 seconds as compared to REP-DPMI's 20 seconds. Without ignoring the overhead, it's
27 still twice as fast. Please note that this program is pure pascal.}
28
29 {$DEFINE DRBOB} // Do not search hidden directories or find hidden files
30 {.$DEFINE SAFE}// Checks memory allocations for out of memory conditions
31 {.$DEFINE CLEANUP}// Frees allocated memory - slows things down a bit
32
33 uses
34 SysUtils;
35
36 const
37 DosDelimSet: set of Char = ['\', ':', #0];
38 MaxHash = 16381; {largest prime number < 16K}
39
40 type
41 PathStr = string;
42 NameString = string[12];
43 St2 = string[2];
44
45 pPath = ^PathRecord; {we will only keep one copy of each unique directory path}
46 PathRecord = record
47 Next: pPath;
48 Path: PathStr;
49 end;
50
51 pDataRec = ^DataRec; {detailed information unique to each dupliacte}
52 DataRec = record
53 Time: longint;
54 Size: longint;
55 Path: pPath;
56 Next: pDataRec;
57 end;
58
59 link = ^RBTreeNode;
60 RBTreeNode = record {Red/ black tree node}
61 Key: NameString; {Name of file}
62 red: boolean;
63 l, r: link;
64 DataP: pDataRec; {linked list of detail information}
65 end;
66
67 pFileRec = ^MyFileRec;
68 MyFileRec = record
69 {Hash table record. There will only be one record for each duplicate file name}
70 Name: NameString;
71 Time: longint;
72 Size: longint;
73 Path: pPath;
74 Node: link; {let's quickly insert duplicate into tree}
75 next: pFileRec; {next record in linked list}
76 end;
77
78 tHashTable = array[0..MAXHASH] of pFileRec;
79
80 var
81 Head, z: link;
82 HashTable: ^tHashTable;
83 PathHead, TempPathHead: pPath;
84 OldName: NameString;
85
86 procedure RBTreeInitialize;
87 {Initialize red/ black tree}
88 begin
89 New(Z);
90 {$IFDEF SAFE}
91 if Z = nil then
92 exit;
93 {$ENDIF}
94 z^.l := z;
95 z^.r := z;
96 z^.red := false;
97 New(Head);
98 {$IFDEF SAFE}
99 if Head = nil then
100 exit;
101 {$ENDIF}
102 Head^.r := z;
103 Head^.l := z;
104 Head^.Key := '';
105 Head^.Red := false;
106 end;
107
108 function Rotate(const Value: MyFileRec; y: link): link;
109 var
110 c, gc: link;
111 begin
112 if Value.Name < Y^.Key then
113 c := y^.l
114 else
115 c := y^.r;
116 if Value.Name < c^.Key then
117 begin
118 gc := c^.l;
119 c^.l := gc^.r;
120 gc^.r := c;
121 end
122 else
123 begin
124 gc := c^.r;
125 c^.r := gc^.l;
126 gc^.l := c;
127 end;
128 if Value.Name < Y^.Key then
129 y^.l := gc
130 else
131 y^.r := gc;
132 Rotate := gc;
133 end;
134
135 function Split(const Value: MyFileRec; gg, g, p, x: link): link;
136 begin
137 x^.red := true;
138 x^.l^.red := false;
139 x^.r^.red := false;
140 if p^.red then
141 begin
142 g^.red := true;
143 if (Value.Name < g^.Key) <> (Value.Name < p^.Key) then
144 p := Rotate(Value, g);
145 x := rotate(Value, gg);
146 x^.red := false;
147 end;
148 Head^.r^.red := false;
149 split := x;
150 end;
151
152 function RBTreeInsert(const Value: MyFileRec; x: link): link;
153 {Insert file record into red/ black tree}
154 var
155 gg, g, p: link;
156 begin
157 p := x;
158 g := x;
159 repeat
160 gg := g;
161 g := p;
162 p := x;
163 if Value.Name < x^.Key then
164 x := x^.l
165 else
166 x := x^.r;
167 if x^.l^.red and x^.r^.red then
168 x := split(Value, gg, g, p, x);
169 until
170 x = z;
171 new(x);
172 {$IFDEF SAFE}
173 if x = nil then
174 exit;
175 {$ENDIF}
176 x^.Key := Value.Name;
177 New(x^.DataP);
178 {$IFDEF SAFE}
179 if x^.DataP = nil then
180 exit;
181 {$ENDIF}
182 x^.DataP^.Next := nil;
183 x^.DataP^.Time := Value.Time;
184 x^.DataP^.Size := Value.Size;
185 x^.DataP^.Path := Value.Path;
186 x^.l := z;
187 x^.r := z;
188 if Value.Name < p^.Key then
189 p^.l := x
190 else
191 p^.r := x;
192 RbTreeInsert := x;
193 x := Split(Value, gg, g, p, x);
194 end;
195
196 procedure Traverse(p: link);
197 {Traverse red/ black tree, printing out results}
198 var
199 TempQ, q: pDataRec;
200 begin
201 if (p^.l <> z) and (p^.l <> nil) then
202 Traverse(p^.l);
203 if (p <> head) then
204 begin
205 if p^.Key <> OldName then
206 begin
207 OldName := p^.Key;
208 writeln(OldName);
209 end;
210 q := p^.DataP;
211 while q <> nil do
212 begin
213 with q^ do
214 writeln(size: 10, ' ', FormatDateTime('yyyy-mm-dd hh:nn:ss',
215 FileDateToDateTime(Time)), ' ', Path^.Path);
216 {$IFDEF CLEANUP}
217 TempQ := q;
218 q := q^.Next;
219 Dispose(TempQ);
220 {$ELSE}
221 q := q^.Next;
222 {$ENDIF}
223 end;
224 writeln;
225 end;
226 if (p^.r <> z) and (p^.r <> nil) then
227 Traverse(p^.r);
228 {$IFDEF CLEANUP}
229 Dispose(p);
230 {$ENDIF}
231 end;
232
233 function AddBackSlash(const DirName: string): string;
234 {Add a default backslash to a directory name}
235 begin
236 if DirName[Length(DirName)] in DosDelimSet then
237 AddBackSlash := DirName
238 else
239 AddBackSlash := DirName + '\';
240 end;
241
242 function Hash(const Key: NameString): word;
243 var
244 h: word;
245 j: integer;
246 Len: integer;
247 begin
248 Len := Length(Key);
249 h := ord(Key[1]);
250 for j := 2 to Len do
251 begin
252 h := ((h * 32) + Ord(Key[j])) mod MAXHASH;
253 end;
254 Hash := h;
255 end;
256
257 procedure Add(var SR: tSearchRec; DirP: pPath);
258 {Add a new search record/ path to the hash table}
259 var
260 p, q, r: pFileRec;
261 h: word;
262 TempData: pDataRec;
263 begin
264 h := Hash(SR.Name);
265 New(r);
266 {$IFDEF SAFE}
267 if r = nil then
268 exit;
269 {$ENDIF}
270 r^.Name := SR.Name;
271 r^.Time := SR.Time;
272 r^.Size := SR.Size;
273 r^.Path := DirP;
274 r^.Next := nil;
275 r^.Node := nil;
276 p := HashTable^[H];
277 if p = nil then
278 begin {Hash slot not used}
279 HashTable^[h] := r;
280 end
281 else
282 begin
283 q := p;
284 while (p <> nil) and (p^.Name < SR.Name) do
285 begin
286 q := p;
287 p := p^.Next;
288 end;
289 if (p <> nil) and (p^.Name = SR.Name) then
290 begin {Found duplicate file}
291 if p^.Node = nil then
292 begin {was not already in tree}
293 p^.Node := RBTreeInsert(p^, Head);
294 {save link so we don't have to search tree next time}
295 end;
296 New(TempData);
297 {$IFDEF SAFE}
298 if TempData = nil then
299 exit;
300 {$ENDIF}
301 TempData^.Time := Sr.Time;
302 TempData^.Size := Sr.Size;
303 TempData^.Path := DirP;
304 TempData^.Next := p^.Node^.DataP; {Add to linked list on tree node}
305 p^.Node^.DataP := TempData;
306 Dispose(r); {didn't need it after all}
307 end
308 else
309 begin {Not a duplicate}
310 if p = q then
311 begin {Add at start of linked list}
312 HashTable^[H] := r;
313 r^.Next := P;
314 end
315 else
316 begin {Insert into linked list}
317 q^.Next := r;
318 r^.Next := p;
319 end;
320 end;
321 end;
322 end;
323
324 procedure Find(const Path: PathStr);
325 {Recursive file/directory searcher}
326 var
327 Sr: tSearchRec;
328 DirP: pPath;
329 r: integer;
330 begin
331 New(DirP);
332 {$IFDEF SAFE}
333 if DirP = nil then
334 exit;
335 {$ENDIF}
336 DirP^.Path := Path;
337 DirP^.Next := PathHead;
338 PathHead := DirP;
339 r := FindFirst(AddBackSlash(Path) + '*.*', faAnyFile, Sr);
340 while r = 0 do
341 begin
342 {$IFDEF DRBOB} {only do non-hidden directories}
343 if ((Sr.Attr and faDirectory) <> 0) and ((Sr.Attr and faHidden) = 0) then
344 begin
345 {$ELSE} {do them all}
346 if (Sr.Attr and Directory) <> 0 then
347 begin
348 {$ENDIF}
349 if Sr.Name[1] <> '.' then
350 Find(AddBackSlash(Path) + Sr.Name);
351 r := 0;
352 end
353 else
354 begin
355 {$IFDEF DRBOB}
356 if (Sr.Attr and faHidden) = 0 then {Only do non-hidden files}
357 {$ENDIF}
358 Add(Sr, DirP);
359 end;
360 r := FindNext(Sr);
361 end;
362 end;
363
364 function HeapFunc(Size: Word): Integer; far;
365 begin
366 HeapFunc := 1;
367 end;
368
369 procedure Init;
370 begin
371 OldName := '';
372 PathHead := nil;
373 {$IFDEF SAFE}
374 HeapError := @HeapFunc;
375 {$ENDIF}
376 New(HashTable);
377 {$IFDEF SAFE}
378 if HashTable = nil then
379 halt;
380 {$ENDIF}
381 FillChar(HashTable^, sizeof(HashTable^), 0);
382 RBTreeInitialize;
383 end;
384
385 procedure Process;
386 begin
387 Find(ParamStr(1) + '\');
388 Traverse(Head);
389 end;
390
391 procedure Done;
392 var
393 i: integer;
394 q, tempq: pFileRec;
395 begin
396 {$IFDEF CLEANUP}
397 Dispose(Z);
398 for i := 0 to MAXHASH - 1 do
399 begin
400 if HashTable^[i] <> nil then
401 begin
402 q := HashTable^[i];
403 while q <> nil do
404 begin
405 tempq := q^.next;
406 Dispose(q);
407 q := tempq;
408 end;
409 end;
410 end;
411 Dispose(HashTable);
412 TempPathHead := PathHead;
413 while PathHead <> nil do
414 begin
415 TempPathHead := PathHead^.Next;
416 FreeMem(PathHead, Length(PathHead^.Path) + 5);
417 PathHead := TempPathHead;
418 end;
419 {$ENDIF}
420 end;
421
422 begin
423 Init;
424 Process;
425 Done;
426 end.
|