1 |
#! /bin/sh /usr/share/dpatch/dpatch-run |
2 |
## 05_lucenedemo.dpatch by <gregor+debian@comodo.priv.at> |
3 |
## |
4 |
## All lines beginning with `## DP:' are a description of the patch. |
5 |
## DP: Work around the missing org.apache.lucene.demo package: use org.apache.lucene.ant.HtmlDocument and local copies of uid()/uid2url(). |
6 |
|
7 |
@DPATCH@ |
8 |
diff -urNad libpdfbox-java~/src/org/pdfbox/searchengine/lucene/IndexFiles.java libpdfbox-java/src/org/pdfbox/searchengine/lucene/IndexFiles.java |
9 |
--- libpdfbox-java~/src/org/pdfbox/searchengine/lucene/IndexFiles.java 2007-05-17 22:34:43.000000000 +0200 |
10 |
+++ libpdfbox-java/src/org/pdfbox/searchengine/lucene/IndexFiles.java 2007-09-07 15:33:43.000000000 +0200 |
11 |
@@ -61,9 +61,9 @@ |
12 |
|
13 |
import org.apache.lucene.analysis.standard.StandardAnalyzer; |
14 |
|
15 |
-import org.apache.lucene.demo.HTMLDocument; |
16 |
- |
17 |
+import org.apache.lucene.ant.HtmlDocument; |
18 |
import org.apache.lucene.document.Document; |
19 |
+import org.apache.lucene.document.DateField; |
20 |
|
21 |
import org.apache.lucene.index.IndexReader; |
22 |
import org.apache.lucene.index.IndexWriter; |
23 |
@@ -207,7 +207,7 @@ |
24 |
while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) ) |
25 |
{ |
26 |
System.out.println("deleting " + |
27 |
- HTMLDocument.uid2url(uidIter.term().text())); |
28 |
+ uid2url(uidIter.term().text())); |
29 |
reader.deleteDocuments(uidIter.term()); |
30 |
uidIter.next(); |
31 |
} |
32 |
@@ -240,7 +240,7 @@ |
33 |
{ |
34 |
if (uidIter != null) |
35 |
{ |
36 |
- String uid = HTMLDocument.uid(file); // construct uid for doc |
37 |
+ String uid = uid(file); // construct uid for doc |
38 |
|
39 |
while( uidIter.term() != null && |
40 |
uidIter.term().field().equals( "uid" ) && |
41 |
@@ -249,7 +249,7 @@ |
42 |
if (deleting) |
43 |
{ // delete stale docs |
44 |
System.out.println("deleting " + |
45 |
- HTMLDocument.uid2url(uidIter.term().text())); |
46 |
+ uid2url(uidIter.term().text())); |
47 |
reader.deleteDocuments(uidIter.term()); |
48 |
} |
49 |
uidIter.next(); |
50 |
@@ -287,7 +287,7 @@ |
51 |
path.endsWith(".TXT")) |
52 |
{ |
53 |
System.out.println( "Indexing Text document: " + file ); |
54 |
- doc = HTMLDocument.Document(file); |
55 |
+ doc = HtmlDocument.Document(file); |
56 |
} |
57 |
else if( path.endsWith( ".PDF" ) ) |
58 |
{ |
59 |
@@ -304,4 +304,27 @@ |
60 |
writer.addDocument(doc); |
61 |
} |
62 |
} |
63 |
-} |
64 |
\ No newline at end of file |
65 |
+ |
66 |
+ |
67 |
+ /* |
68 |
+ * The following 2 methods are taken from the |
69 |
+ * org.apache.lucene.demo.HTMLDocument class shipped with |
70 |
+ * Lucene 1.4.3. |
71 |
+ */ |
72 |
+ private static char dirSep = System.getProperty("file.separator").charAt(0); |
73 |
+ |
74 |
+ private static String uid(File f) { |
75 |
+ // Build the uid from the file's path and last-modified time in such a way |
76 |
+ // that lexicographic sorting of uids gives the same order as a walk of the |
77 |
+ // file hierarchy. To that end null (\u0000) replaces every directory |
78 |
+ // separator in the path and also separates the path from the date. |
79 |
+ return f.getPath().replace(dirSep, '\u0000') + |
80 |
+ "\u0000" + |
81 |
+ DateField.timeToString(f.lastModified()); |
82 |
+ } |
83 |
+ |
84 |
+ private static String uid2url(String uid) { |
85 |
+ String url = uid.replace('\u0000', '/'); // turn every null separator back into a slash |
86 |
+ return url.substring(0, url.lastIndexOf('/')); // drop the date after the last slash; assumes uid contains at least one separator |
87 |
+ } |
88 |
+} |