1 | package org.greenstone.gsdl3.gs3build.doctypes;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 | import java.net.*;
|
---|
5 |
|
---|
6 | import org.greenstone.gsdl3.gs3build.metadata.*;
|
---|
7 | import org.greenstone.gsdl3.gs3build.util.HTTPTools;
|
---|
8 |
|
---|
9 | public class HTMLRecogniser implements RecogniserInterface
|
---|
10 | {
|
---|
11 | DocumentList listRepository;
|
---|
12 |
|
---|
13 | public HTMLRecogniser(DocumentList listRepository)
|
---|
14 | { this.listRepository = listRepository;
|
---|
15 | }
|
---|
16 |
|
---|
17 | public boolean parseDocument(METSFile file)
|
---|
18 | {
|
---|
19 | String MIMEType = file.getMIMEType();
|
---|
20 | if (MIMEType == null ||
|
---|
21 | MIMEType.equals("text/html")) {
|
---|
22 | URL location = file.getLocation();
|
---|
23 | return this.parseDocument(location);
|
---|
24 | }
|
---|
25 | return false;
|
---|
26 | }
|
---|
27 |
|
---|
28 | public boolean parseDocument(URL url)
|
---|
29 | { String fileName = null;
|
---|
30 |
|
---|
31 | if (url.toString().startsWith("file://")) {
|
---|
32 | fileName = url.toString().substring(7);
|
---|
33 | }
|
---|
34 | else if (url.toString().startsWith("file:/")) {
|
---|
35 | fileName = url.toString().substring(5);
|
---|
36 | }
|
---|
37 |
|
---|
38 | if (fileName != null) {
|
---|
39 | if (fileName.endsWith(".htm") ||
|
---|
40 | fileName.endsWith(".html"))
|
---|
41 | { System.out.println("Posting HTML Document " + fileName);
|
---|
42 |
|
---|
43 | HTMLDocument doc = new HTMLDocument(url);
|
---|
44 | this.listRepository.addDocument(doc);
|
---|
45 | return true;
|
---|
46 | }
|
---|
47 | }
|
---|
48 | else {
|
---|
49 | // Get Mime type remotely, and then proceed if required
|
---|
50 | String mimeType = HTTPTools.getMIMEType(url);
|
---|
51 |
|
---|
52 | if (mimeType == "text/html")
|
---|
53 | { System.out.println("Posting HTML Document " + url.toString());
|
---|
54 |
|
---|
55 | HTMLDocument doc = new HTMLDocument(url);
|
---|
56 | this.listRepository.addDocument(doc);
|
---|
57 | return true;
|
---|
58 | }
|
---|
59 | }
|
---|
60 | return false;
|
---|
61 | }
|
---|
62 | }
|
---|