Author: sergiykarpenko
Date: 2011-01-26 03:54:27 -0500 (Wed, 26 Jan 2011)
New Revision: 3887
Added:
core/trunk/exo.core.component.document/src/test/resources/pfs_accapp.pdf
Modified:
core/trunk/exo.core.component.document/pom.xml
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java
core/trunk/pom.xml
Log:
EXOJCR-1175: Implement PDFDocumentReader.getProperties using PDFBox
Modified: core/trunk/exo.core.component.document/pom.xml
===================================================================
--- core/trunk/exo.core.component.document/pom.xml 2011-01-26 08:25:40 UTC (rev 3886)
+++ core/trunk/exo.core.component.document/pom.xml 2011-01-26 08:54:27 UTC (rev 3887)
@@ -55,10 +55,6 @@
<artifactId>pdfbox</artifactId>
</dependency>
<dependency>
- <groupId>com.lowagie</groupId>
- <artifactId>itext</artifactId>
- </dependency>
- <dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
</dependency>
Modified: core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java
===================================================================
--- core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java 2011-01-26 08:25:40 UTC (rev 3886)
+++ core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java 2011-01-26 08:54:27 UTC (rev 3887)
@@ -18,38 +18,30 @@
*/
package org.exoplatform.services.document.impl;
-import com.lowagie.text.pdf.PdfDate;
-import com.lowagie.text.pdf.PdfReader;
-
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchemaBasic;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.XMPSchemaPDF;
+import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.util.PDFTextStripper;
-import org.exoplatform.commons.utils.ISO8601;
-import org.exoplatform.commons.utils.SecurityHelper;
import org.exoplatform.services.document.DCMetaData;
import org.exoplatform.services.document.DocumentReadException;
import org.exoplatform.services.log.ExoLogger;
import org.exoplatform.services.log.Log;
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.security.AccessController;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
-import java.text.ParseException;
import java.util.Calendar;
-import java.util.HashMap;
import java.util.Properties;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
/**
* Created by The eXo Platform SAS A parser of Adobe PDF files.
*
@@ -167,227 +159,279 @@
* @see org.exoplatform.services.document.DocumentReader#getProperties(java.io.
* InputStream)
*/
- public Properties getProperties(InputStream is) throws IOException,
DocumentReadException
+ public Properties getProperties(final InputStream is) throws IOException,
DocumentReadException
{
+ try
+ {
+ return (Properties)AccessController.doPrivileged(new
PrivilegedExceptionAction<Object>()
+ {
+ public Object run() throws Exception
+ {
+ if (is == null)
+ {
+ throw new NullPointerException("InputStream is null.");
+ }
- Properties props = null;
+ PDDocument pdDocument = PDDocument.load(is);
+ Properties props = new Properties();
+ try
+ {
+ if (pdDocument.isEncrypted())
+ {
+ try
+ {
+ pdDocument.decrypt("");
+ }
+ catch (InvalidPasswordException e)
+ {
+ throw new DocumentReadException("The pdf document is
encrypted.", e);
+ }
+ catch (org.apache.pdfbox.exceptions.CryptographyException e)
+ {
+ throw new DocumentReadException(e.getMessage(), e);
+ }
+ }
- PdfReader reader = new PdfReader(is, "".getBytes());
+ PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
+ PDMetadata meta = catalog.getMetadata();
+ if (meta != null)
+ {
+ XMPMetadata metadata = meta.exportXMPMetadata();
- // Read the file metadata
- byte[] metadata = reader.getMetadata();
+ XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
+ if (dc != null)
+ {
+ try
+ {
+ if (dc.getTitle() != null)
+ props.put(DCMetaData.TITLE, dc.getTitle());
+ }
+ catch (Exception e)
+ {
+ log.warn("getTitle failed: " + e);
+ }
+ try
+ {
+ if (dc.getDescription() != null)
+ props.put(DCMetaData.DESCRIPTION, dc.getDescription());
+ }
+ catch (Exception e)
+ {
+ log.warn("getSubject failed: " + e);
+ }
- if (metadata != null)
- {
- // there is XMP metadata try exctract it
- props = getPropertiesFromMetadata(metadata);
- }
+ try
+ {
+ if (dc.getCreators() != null)
+ {
+ for (String creator : dc.getCreators())
+ {
+ props.put(DCMetaData.CREATOR, creator);
+ }
+ }
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreator failed: " + e);
+ }
- if (props == null)
- {
- // it's old pdf document version
- props = getPropertiesFromInfo(reader.getInfo());
- }
- reader.close();
- if (is != null)
- try
- {
- is.close();
- }
- catch (IOException e)
- {
- }
- return props;
- }
+ try
+ {
+ if (dc.getDates() != null)
+ {
+ for (Calendar date : dc.getDates())
+ {
+ props.put(DCMetaData.DATE, date);
+ }
+ }
+ }
+ catch (Exception e)
+ {
+ log.warn("getDate failed: " + e);
+ }
+ }
- /**
- * Extract properties from XMP xml.
- *
- * @param metadata XML as byte array
- * @return extracted properties
- * @throws DocumentReadException
- * @throws Exception if extracting fails
- */
- protected Properties getPropertiesFromMetadata(final byte[] metadata) throws
IOException, DocumentReadException
- {
+ XMPSchemaPDF pdf = metadata.getPDFSchema();
+ if (pdf != null)
+ {
+ try
+ {
+ if (pdf.getKeywords() != null)
+ props.put(DCMetaData.SUBJECT, pdf.getKeywords());
+ }
+ catch (Exception e)
+ {
+ log.warn("getKeywords failed: " + e);
+ }
- Properties props = null;
+ try
+ {
+ if (pdf.getProducer() != null)
+ props.put(DCMetaData.PUBLISHER, pdf.getProducer());
+ }
+ catch (Exception e)
+ {
+ log.warn("getProducer failed: " + e);
+ }
+ }
- // parse xml
- Document doc;
- try
- {
- final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
- doc = SecurityHelper.doPrivilegedExceptionAction(new
PrivilegedExceptionAction<Document>()
- {
- public Document run() throws Exception
- {
- DocumentBuilder docBuilder = dbf.newDocumentBuilder();
- return docBuilder.parse(new ByteArrayInputStream(metadata));
+ XMPSchemaBasic basic = metadata.getBasicSchema();
+ if (basic != null)
+ {
+ try
+ {
+ if (basic.getCreateDate() != null)
+ props.put(DCMetaData.DATE, basic.getCreateDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreationDate failed: " + e);
+ }
+ try
+ {
+ if (basic.getModifyDate() != null)
+ props.put(DCMetaData.DATE, basic.getModifyDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getModificationDate failed: " + e);
+ }
+ // try
+ // {
+ // if (basic.getCreatorTool() != null)
+ // props.put(DCMetaData.PUBLISHER, basic.getCreatorTool());
+ // }
+ // catch (Exception e)
+ // {
+ // log.warn("getCreatorTool failed: " + e);
+ // }
+ }
+ }
+ else
+ {
+ // The pdf doesn't contain any metadata, try to use the
document
+ // information instead
+ PDDocumentInformation docInfo =
pdDocument.getDocumentInformation();
+
+ if (docInfo != null)
+ {
+ try
+ {
+ if (docInfo.getAuthor() != null)
+ props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
+ }
+ catch (Exception e)
+ {
+ log.warn("getAuthor failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getCreationDate() != null)
+ props.put(DCMetaData.DATE, docInfo.getCreationDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreationDate failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getCreator() != null)
+ props.put(DCMetaData.CREATOR, docInfo.getCreator());
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreator failed: " + e);
+ }
+ try
+ {
+
+ if (docInfo.getKeywords() != null)
+ props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
+ }
+ catch (Exception e)
+ {
+ log.warn("getKeywords failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getModificationDate() != null)
+ props.put(DCMetaData.DATE, docInfo.getModificationDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getModificationDate failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getProducer() != null)
+ props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
+ }
+ catch (Exception e)
+ {
+ log.warn("getProducer failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getSubject() != null)
+ props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
+ }
+ catch (Exception e)
+ {
+ log.warn("getSubject failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getTitle() != null)
+ props.put(DCMetaData.TITLE, docInfo.getTitle());
+ }
+ catch (Exception e)
+ {
+ log.warn("getTitle failed: " + e);
+ }
+
+ // docInfo.getTrapped();
+ }
+ }
+ }
+ finally
+ {
+ if (pdDocument != null)
+ {
+ pdDocument.close();
+ }
+
+ if (is != null)
+ {
+ try
+ {
+ is.close();
+ }
+ catch (IOException e)
+ {
+ }
+ }
+ }
+ return props;
}
});
+
}
catch (PrivilegedActionException pae)
{
Throwable cause = pae.getCause();
- if (cause instanceof SAXException)
+ if (cause instanceof IOException)
{
- throw new DocumentReadException(cause.getMessage(), cause);
- }
- else if (cause instanceof ParserConfigurationException)
- {
- throw (RuntimeException)cause;
- }
- else if (cause instanceof IOException)
- {
throw (IOException)cause;
}
else if (cause instanceof RuntimeException)
{
- throw new DocumentReadException(cause.getMessage(), cause);
+ throw (RuntimeException)cause;
}
else
{
throw new RuntimeException(cause);
}
}
-
- // Check is there PDF/A-1 XMP
- String version = "";
- NodeList list = doc.getElementsByTagName("pdfaid:conformance");
- if (list != null && list.item(0) != null)
- {
- version += list.item(0).getTextContent() + "-";
- }
-
- list = doc.getElementsByTagName("pdfaid:part");
- if (list != null && list.item(0) != null)
- {
- version += list.item(0).getTextContent();
- }
-
- // PDF/A-1a or PDF/A-1b
- if (version.equalsIgnoreCase("A-1"))
- {
- props = getPropsFromPDFAMetadata(doc);
- }
-
- return props;
}
- /**
- * Extracts properties from PDF Info hash set.
- *
- * @param Pdf Info hash set
- * @return Extracted properties
- * @throws Exception if extracting fails
- */
- @SuppressWarnings("unchecked")
- protected Properties getPropertiesFromInfo(HashMap info) throws IOException
- {
- Properties props = new Properties();
-
- String title = (String)info.get("Title");
- if (title != null)
- {
- props.put(DCMetaData.TITLE, title);
- }
-
- String author = (String)info.get("Author");
- if (author != null)
- {
- props.put(DCMetaData.CREATOR, author);
- }
-
- String subject = (String)info.get("Subject");
- if (subject != null)
- {
- props.put(DCMetaData.SUBJECT, subject);
- }
-
- String creationDate = (String)info.get("CreationDate");
- if (creationDate != null)
- {
- props.put(DCMetaData.DATE, PdfDate.decode(creationDate));
- }
-
- String modDate = (String)info.get("ModDate");
- if (modDate != null)
- {
- props.put(DCMetaData.DATE, PdfDate.decode(modDate));
- }
-
- return props;
- }
-
- private Properties getPropsFromPDFAMetadata(Document doc) throws IOException,
DocumentReadException
- {
- Properties props = new Properties();
- // get properties
- NodeList list = doc.getElementsByTagName("rdf:li");
- if (list != null && list.getLength() > 0)
- {
- for (int i = 0; i < list.getLength(); i++)
- {
-
- Node n = list.item(i);
- // dc:title - TITLE
- if
(n.getParentNode().getParentNode().getNodeName().equals("dc:title"))
- {
- String title = n.getLastChild().getTextContent();
- props.put(DCMetaData.TITLE, title);
- }
-
- // dc:creator - CREATOR
- if
(n.getParentNode().getParentNode().getNodeName().equals("dc:creator"))
- {
- String author = n.getLastChild().getTextContent();
- props.put(DCMetaData.CREATOR, author);
- }
-
- // DC:description - SUBJECT
- if
(n.getParentNode().getParentNode().getNodeName().equals("dc:description"))
- {
- String description = n.getLastChild().getTextContent();
- props.put(DCMetaData.SUBJECT, description);
- // props.put(DCMetaData.DESCRIPTION, description);
- }
- }
- }
-
- try
- {
- // xmp:CreateDate - DATE
- list = doc.getElementsByTagName("xmp:CreateDate");
- if (list != null && list.item(0) != null)
- {
- Node creationDateNode = list.item(0).getLastChild();
- if (creationDateNode != null)
- {
- String creationDate = creationDateNode.getTextContent();
- Calendar c = ISO8601.parseEx(creationDate);
- props.put(DCMetaData.DATE, c);
- }
- }
-
- // xmp:ModifyDate - DATE
- list = doc.getElementsByTagName("xmp:ModifyDate");
- if (list != null && list.item(0) != null)
- {
- Node modifyDateNode = list.item(0).getLastChild();
- if (modifyDateNode != null)
- {
- String modifyDate = modifyDateNode.getTextContent();
- Calendar c = ISO8601.parseEx(modifyDate);
- props.put(DCMetaData.DATE, c);
- }
- }
- }
- catch (ParseException e)
- {
- throw new DocumentReadException(e.getMessage(), e);
- }
- return props;
- }
-
}
Modified: core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java
===================================================================
--- core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java 2011-01-26 08:25:40 UTC (rev 3886)
+++ core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java 2011-01-26 08:54:27 UTC (rev 3887)
@@ -18,7 +18,6 @@
*/
package org.exoplatform.services.document.test;
-import org.exoplatform.commons.utils.ISO8601;
import org.exoplatform.container.xml.InitParams;
import org.exoplatform.services.document.DCMetaData;
import org.exoplatform.services.document.DocumentReader;
@@ -77,10 +76,10 @@
Properties etalon = new Properties();
etalon.put(DCMetaData.TITLE, "Test de convertion de fichier tif");
etalon.put(DCMetaData.CREATOR, "Christian Klaus");
- etalon.put(DCMetaData.SUBJECT, "20080901 TEST Christian Etat OK");
- Calendar c = ISO8601.parseEx("2008-09-01T08:01:10+00:00");
- etalon.put(DCMetaData.DATE, c);
- evalProps(etalon, testprops);
+ etalon.put(DCMetaData.DESCRIPTION, "20080901 TEST Christian Etat
OK");
+ // Calendar c = ISO8601.parseEx("2008-09-01T08:01:10+00:00");
+ // etalon.put(DCMetaData.DATE, c);
+ evalProps(etalon, testprops, false);
}
finally
{
@@ -88,6 +87,26 @@
}
}
+ public void testPDFDocumentReaderServiceBrokenFile() throws Exception
+ {
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/pfs_accapp.pdf");
+ try
+ {
+
+ DocumentReader rdr = service.getDocumentReader("application/pdf");
+ Properties testprops = rdr.getProperties(is);
+ Properties etalon = new Properties();
+ etalon.put(DCMetaData.TITLE, "Personal Account Opening Form VN");
+ etalon.put(DCMetaData.CREATOR, "mr");
+ etalon.put(DCMetaData.PUBLISHER, "Adobe LiveCycle Designer ES 8.2");
+ evalProps(etalon, testprops, false);
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
public void testWordDocumentReaderService() throws Exception
{
InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.doc");
@@ -101,7 +120,7 @@
etalon.put(DCMetaData.CREATOR, "Max Yakimenko");
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -122,7 +141,7 @@
etalon.put(DCMetaData.CREATOR, "Max Yakimenko");
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -144,7 +163,7 @@
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -172,7 +191,7 @@
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -200,7 +219,7 @@
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -228,7 +247,7 @@
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -254,7 +273,7 @@
etalon.put(DCMetaData.CREATOR, "Sergiy Karpenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -262,7 +281,7 @@
}
}
- private void evalProps(Properties etalon, Properties testedProps)
+ private void evalProps(Properties etalon, Properties testedProps, boolean testSize)
{
Iterator it = etalon.entrySet().iterator();
while (it.hasNext())
@@ -272,7 +291,10 @@
assertNotNull(prop.getKey() + " property not founded. ", tval);
assertEquals(prop.getKey() + " property value is incorrect",
prop.getValue(), tval);
}
- assertEquals("size is incorrect", etalon.size(), testedProps.size());
+ if (testSize)
+ {
+ assertEquals("size is incorrect", etalon.size(), testedProps.size());
+ }
}
}
Added: core/trunk/exo.core.component.document/src/test/resources/pfs_accapp.pdf
===================================================================
(Binary files differ)
Property changes on: core/trunk/exo.core.component.document/src/test/resources/pfs_accapp.pdf
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Modified: core/trunk/pom.xml
===================================================================
--- core/trunk/pom.xml 2011-01-26 08:25:40 UTC (rev 3886)
+++ core/trunk/pom.xml 2011-01-26 08:54:27 UTC (rev 3887)
@@ -272,15 +272,7 @@
<artifactId>pdfbox</artifactId>
<version>1.1.0</version>
</dependency>
-
<dependency>
- <groupId>com.lowagie</groupId>
- <artifactId>itext</artifactId>
- <version>2.1.0</version>
- <scope>compile</scope>
- </dependency>
-
- <dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>1.6</version>