Author: tolusha
Date: 2011-02-08 11:34:46 -0500 (Tue, 08 Feb 2011)
New Revision: 3949
Added:
core/branches/2.3.x/patch/2.3.8/
core/branches/2.3.x/patch/2.3.8/COR-228/
core/branches/2.3.x/patch/2.3.8/COR-228/COR-228.patch
Log:
COR-228: patch proposed
Added: core/branches/2.3.x/patch/2.3.8/COR-228/COR-228.patch
===================================================================
--- core/branches/2.3.x/patch/2.3.8/COR-228/COR-228.patch (rev 0)
+++ core/branches/2.3.x/patch/2.3.8/COR-228/COR-228.patch 2011-02-08 16:34:46 UTC (rev 3949)
@@ -0,0 +1,615 @@
+Index: exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java
+===================================================================
+--- exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java (revision 3930)
++++ exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java (working copy)
+@@ -18,37 +18,30 @@
+ */
+ package org.exoplatform.services.document.impl;
+
+-import com.lowagie.text.pdf.PdfDate;
+-import com.lowagie.text.pdf.PdfReader;
+-
++import org.apache.jempbox.xmp.XMPMetadata;
++import org.apache.jempbox.xmp.XMPSchemaBasic;
++import org.apache.jempbox.xmp.XMPSchemaDublinCore;
++import org.apache.pdfbox.exceptions.InvalidPasswordException;
+ import org.apache.pdfbox.pdmodel.PDDocument;
++import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
++import org.apache.pdfbox.pdmodel.PDDocumentInformation;
++import org.apache.pdfbox.pdmodel.common.PDMetadata;
+ import org.apache.pdfbox.util.PDFTextStripper;
+-import org.exoplatform.commons.utils.ISO8601;
+ import org.exoplatform.services.document.DCMetaData;
+ import org.exoplatform.services.document.DocumentReadException;
+ import org.exoplatform.services.log.ExoLogger;
+ import org.exoplatform.services.log.Log;
+-import org.w3c.dom.Document;
+-import org.w3c.dom.Node;
+-import org.w3c.dom.NodeList;
+-import org.xml.sax.SAXException;
+
+-import java.io.ByteArrayInputStream;
+ import java.io.IOException;
+ import java.io.InputStream;
+ import java.io.StringWriter;
+ import java.security.AccessController;
+ import java.security.PrivilegedActionException;
+ import java.security.PrivilegedExceptionAction;
+-import java.text.ParseException;
+ import java.util.Calendar;
+-import java.util.HashMap;
++import java.util.List;
+ import java.util.Properties;
+
+-import javax.xml.parsers.DocumentBuilder;
+-import javax.xml.parsers.DocumentBuilderFactory;
+-import javax.xml.parsers.ParserConfigurationException;
+-
+ /**
+ * Created by The eXo Platform SAS A parser of Adobe PDF files.
+ *
+@@ -168,204 +161,186 @@
+ */
+ public Properties getProperties(InputStream is) throws IOException, DocumentReadException
+ {
+-
+- Properties props = null;
+-
+- PdfReader reader = new PdfReader(is, "".getBytes());
+-
+- // Read the file metadata
+- byte[] metadata = reader.getMetadata();
+-
+- if (metadata != null)
+- {
+- // there is XMP metadata try exctract it
+- props = getPropertiesFromMetadata(metadata);
+- }
+-
+- if (props == null)
+- {
+- // it's old pdf document version
+- props = getPropertiesFromInfo(reader.getInfo());
+- }
+- reader.close();
+- if (is != null)
+- try
+- {
+- is.close();
+- }
+- catch (IOException e)
+- {
+- }
+- return props;
+- }
+-
+- /**
+- * Extract properties from XMP xml.
+- *
+- * @param metadata XML as byte array
+- * @return extracted properties
+- * @throws DocumentReadException
+- * @throws Exception if extracting fails
+- */
+- protected Properties getPropertiesFromMetadata(byte[] metadata) throws IOException, DocumentReadException
+- {
+-
+- Properties props = null;
+-
+- // parse xml
+-
+- Document doc;
++ PDDocument pdDocument = PDDocument.load(is);
++ Properties props = new Properties();
+ try
+ {
+- DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+- DocumentBuilder docBuilder = dbf.newDocumentBuilder();
+- doc = docBuilder.parse(new ByteArrayInputStream(metadata));
+- }
+- catch (SAXException e)
+- {
+- throw new DocumentReadException(e.getMessage(), e);
+- }
+- catch (ParserConfigurationException e)
+- {
+- throw new DocumentReadException(e.getMessage(), e);
+- }
+-
+- // Check is there PDF/A-1 XMP
+- String version = "";
+- NodeList list = doc.getElementsByTagName("pdfaid:conformance");
+- if (list != null && list.item(0) != null)
+- {
+- version += list.item(0).getTextContent() + "-";
+- }
+-
+- list = doc.getElementsByTagName("pdfaid:part");
+- if (list != null && list.item(0) != null)
+- {
+- version += list.item(0).getTextContent();
+- }
+-
+- // PDF/A-1a or PDF/A-1b
+- if (version.equalsIgnoreCase("A-1"))
+- {
+- props = getPropsFromPDFAMetadata(doc);
+- }
+-
+- return props;
+- }
+-
+- /**
+- * Extracts properties from PDF Info hash set.
+- *
+- * @param Pdf Info hash set
+- * @return Extracted properties
+- * @throws Exception if extracting fails
+- */
+- @SuppressWarnings("unchecked")
+- protected Properties getPropertiesFromInfo(HashMap info) throws IOException
+- {
+- Properties props = new Properties();
+-
+- String title = (String)info.get("Title");
+- if (title != null)
+- {
+- props.put(DCMetaData.TITLE, title);
+- }
+-
+- String author = (String)info.get("Author");
+- if (author != null)
+- {
+- props.put(DCMetaData.CREATOR, author);
+- }
+-
+- String subject = (String)info.get("Subject");
+- if (subject != null)
+- {
+- props.put(DCMetaData.SUBJECT, subject);
+- }
+-
+- String creationDate = (String)info.get("CreationDate");
+- if (creationDate != null)
+- {
+- props.put(DCMetaData.DATE, PdfDate.decode(creationDate));
+- }
+-
+- String modDate = (String)info.get("ModDate");
+- if (modDate != null)
+- {
+- props.put(DCMetaData.DATE, PdfDate.decode(modDate));
+- }
+-
+- return props;
+- }
+-
+- private Properties getPropsFromPDFAMetadata(Document doc) throws IOException, DocumentReadException
+- {
+- Properties props = new Properties();
+- // get properties
+- NodeList list = doc.getElementsByTagName("rdf:li");
+- if (list != null && list.getLength() > 0)
+- {
+- for (int i = 0; i < list.getLength(); i++)
++ if (pdDocument.isEncrypted())
+ {
+-
+- Node n = list.item(i);
+- // dc:title - TITLE
+- if (n.getParentNode().getParentNode().getNodeName().equals("dc:title"))
++ try
+ {
+- String title = n.getLastChild().getTextContent();
+- props.put(DCMetaData.TITLE, title);
++ pdDocument.decrypt("");
+ }
+-
+- // dc:creator - CREATOR
+- if (n.getParentNode().getParentNode().getNodeName().equals("dc:creator"))
++ catch (InvalidPasswordException e)
+ {
+- String author = n.getLastChild().getTextContent();
+- props.put(DCMetaData.CREATOR, author);
++ throw new DocumentReadException("The pdf document is encrypted.", e);
+ }
+-
+- // DC:description - SUBJECT
+- if (n.getParentNode().getParentNode().getNodeName().equals("dc:description"))
++ catch (org.apache.pdfbox.exceptions.CryptographyException e)
+ {
+- String description = n.getLastChild().getTextContent();
+- props.put(DCMetaData.SUBJECT, description);
+- // props.put(DCMetaData.DESCRIPTION, description);
++ throw new DocumentReadException(e.getMessage(), e);
+ }
+ }
+- }
+
+- try
+- {
+- // xmp:CreateDate - DATE
+- list = doc.getElementsByTagName("xmp:CreateDate");
+- if (list != null && list.item(0) != null)
++ PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
++ PDMetadata meta = catalog.getMetadata();
++ if (meta != null)
+ {
+- Node creationDateNode = list.item(0).getLastChild();
+- if (creationDateNode != null)
++ XMPMetadata metadata = meta.exportXMPMetadata();
++
++ XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
++ if (dc != null)
+ {
+- String creationDate = creationDateNode.getTextContent();
+- Calendar c = ISO8601.parseEx(creationDate);
+- props.put(DCMetaData.DATE, c);
++ try
++ {
++ if (dc.getTitle() != null)
++ props.put(DCMetaData.TITLE, dc.getTitle());
++ }
++ catch (Exception e)
++ {
++ log.warn("getTitle failed: " + e);
++ }
++ try
++ {
++ if (dc.getDescription() != null)
++ props.put(DCMetaData.SUBJECT, dc.getDescription());
++ }
++ catch (Exception e)
++ {
++ log.warn("getSubject failed: " + e);
++ }
++
++ try
++ {
++ if (dc.getCreators() != null)
++ {
++ List<String> list = dc.getCreators();
++ for (String creator : list)
++ {
++ props.put(DCMetaData.CREATOR, creator);
++ }
++ }
++ }
++ catch (Exception e)
++ {
++ log.warn("getCreator failed: " + e);
++ }
++
++ try
++ {
++ if (dc.getDates() != null)
++ {
++ List<Calendar> list = dc.getDates();
++
++ for (Calendar date : list)
++ {
++ props.put(DCMetaData.DATE, date);
++ }
++ }
++ }
++ catch (Exception e)
++ {
++ log.warn("getDate failed: " + e);
++ }
+ }
++
++ XMPSchemaBasic basic = metadata.getBasicSchema();
++ if (basic != null)
++ {
++ try
++ {
++ if (basic.getCreateDate() != null)
++ props.put(DCMetaData.DATE, basic.getCreateDate());
++ }
++ catch (Exception e)
++ {
++ log.warn("getCreationDate failed: " + e);
++ }
++ try
++ {
++ if (basic.getModifyDate() != null)
++ props.put(DCMetaData.DATE, basic.getModifyDate());
++ }
++ catch (Exception e)
++ {
++ log.warn("getModificationDate failed: " + e);
++ }
++ }
+ }
+
+- // xmp:ModifyDate - DATE
+- list = doc.getElementsByTagName("xmp:ModifyDate");
+- if (list != null && list.item(0) != null)
++ if (props.isEmpty())
+ {
+- Node modifyDateNode = list.item(0).getLastChild();
+- if (modifyDateNode != null)
++ // The pdf doesn't contain any XMP metadata, or the XMP metadata does not contain any
++ // useful data; try to use the document information instead
++ PDDocumentInformation docInfo = pdDocument.getDocumentInformation();
++
++ if (docInfo != null)
+ {
+- String modifyDate = modifyDateNode.getTextContent();
+- Calendar c = ISO8601.parseEx(modifyDate);
+- props.put(DCMetaData.DATE, c);
++ try
++ {
++ if (docInfo.getCreationDate() != null)
++ props.put(DCMetaData.DATE, docInfo.getCreationDate());
++ }
++ catch (Exception e)
++ {
++ log.warn("getCreationDate failed: " + e);
++ }
++ try
++ {
++ if (docInfo.getCreator() != null)
++ props.put(DCMetaData.CREATOR, docInfo.getCreator());
++ }
++ catch (Exception e)
++ {
++ log.warn("getCreator failed: " + e);
++ }
++ try
++ {
++
++ if (docInfo.getKeywords() != null)
++ props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
++ }
++ catch (Exception e)
++ {
++ log.warn("getKeywords failed: " + e);
++ }
++ try
++ {
++ if (docInfo.getModificationDate() != null)
++ props.put(DCMetaData.DATE, docInfo.getModificationDate());
++ }
++ catch (Exception e)
++ {
++ log.warn("getModificationDate failed: " + e);
++ }
++ try
++ {
++ if (docInfo.getSubject() != null)
++ props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
++ }
++ catch (Exception e)
++ {
++ log.warn("getSubject failed: " + e);
++ }
++ try
++ {
++ if (docInfo.getTitle() != null)
++ props.put(DCMetaData.TITLE, docInfo.getTitle());
++ }
++ catch (Exception e)
++ {
++ log.warn("getTitle failed: " + e);
++ }
+ }
+ }
+ }
+- catch (ParseException e)
++ finally
+ {
+- throw new DocumentReadException(e.getMessage(), e);
++ if (pdDocument != null)
++ {
++ pdDocument.close();
++ }
+ }
++
+ return props;
+ }
+-
+ }
+Index: exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java
+===================================================================
+--- exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java (revision 3930)
++++ exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java (working copy)
+@@ -18,7 +18,6 @@
+ */
+ package org.exoplatform.services.document.test;
+
+-import org.exoplatform.commons.utils.ISO8601;
+ import org.exoplatform.services.document.DCMetaData;
+ import org.exoplatform.services.document.DocumentReader;
+ import org.exoplatform.services.document.DocumentReaderService;
+@@ -41,20 +40,48 @@
+ service = (DocumentReaderService)getComponentInstanceOfType(DocumentReaderService.class);
+ }
+
++ public void testPDFDocumentReaderService() throws Exception
++ {
++ InputStream is = TestPropertiesExtracting.class.getResourceAsStream("/test.pdf");
++ DocumentReader rdr = service.getDocumentReader("application/pdf");
++ Properties props = rdr.getProperties(is);
++ assertTrue(props.isEmpty());
++
++ // Properties etalon = new Properties();
++ // etalon.put(DCMetaData.PUBLISHER, "FOP 0.20.4");
++ // evalProps(etalon, props, false);
++ }
++
+ public void testPDFDocumentReaderServiceXMPMetadata() throws Exception
+ {
+ InputStream is = TestPropertiesExtracting.class.getResourceAsStream("/MyTest.pdf");
++ DocumentReader rdr = service.getDocumentReader("application/pdf");
++
++ Properties testprops = rdr.getProperties(is);
++ // printProps(testprops);
++
++ Properties etalon = new Properties();
++ etalon.put(DCMetaData.TITLE, "Test de convertion de fichier tif");
++ etalon.put(DCMetaData.CREATOR, "Christian Klaus");
++ etalon.put(DCMetaData.SUBJECT, "20080901 TEST Christian Etat OK");
++ // Calendar c = ISO8601.parseEx("2008-09-01T08:01:10+00:00");
++ // etalon.put(DCMetaData.DATE, c);
++
++ evalProps(etalon, testprops, false);
++ }
++
++ public void testPDFDocumentReaderServiceBrokenFile() throws Exception
++ {
++ InputStream is = TestPropertiesExtracting.class.getResourceAsStream("/pfs_accapp.pdf");
+ try
+ {
++
+ DocumentReader rdr = service.getDocumentReader("application/pdf");
+ Properties testprops = rdr.getProperties(is);
+ Properties etalon = new Properties();
+- etalon.put(DCMetaData.TITLE, "Test de convertion de fichier tif");
+- etalon.put(DCMetaData.CREATOR, "Christian Klaus");
+- etalon.put(DCMetaData.SUBJECT, "20080901 TEST Christian Etat OK");
+- Calendar c = ISO8601.parseEx("2008-09-01T08:01:10+00:00");
+- etalon.put(DCMetaData.DATE, c);
+- evalProps(etalon, testprops);
++ etalon.put(DCMetaData.TITLE, "Personal Account Opening Form VN");
++ etalon.put(DCMetaData.CREATOR, "mr");
++ evalProps(etalon, testprops, false);
+ }
+ finally
+ {
+@@ -62,6 +89,25 @@
+ }
+ }
+
++ public void testPDFDocumentReaderServiceMetro() throws Exception
++ {
++ InputStream is = TestPropertiesExtracting.class.getResourceAsStream("/metro.pdf");
++ try
++ {
++
++ DocumentReader rdr = service.getDocumentReader("application/pdf");
++ Properties testprops = rdr.getProperties(is);
++ Properties etalon = new Properties();
++ etalon.put(DCMetaData.TITLE, "metro");
++ etalon.put(DCMetaData.CREATOR, "Preview");
++ evalProps(etalon, testprops, false);
++ }
++ finally
++ {
++ is.close();
++ }
++ }
++
+ public void testWordDocumentReaderService() throws Exception
+ {
+ InputStream is = TestPropertiesExtracting.class.getResourceAsStream("/test.doc");
+@@ -75,7 +121,7 @@
+ etalon.put(DCMetaData.CREATOR, "Max Yakimenko");
+ etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+- evalProps(etalon, props);
++ evalProps(etalon, props, true);
+ }
+ finally
+ {
+@@ -96,7 +142,7 @@
+ etalon.put(DCMetaData.CREATOR, "Max Yakimenko");
+ etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+- evalProps(etalon, props);
++ evalProps(etalon, props, true);
+ }
+ finally
+ {
+@@ -118,7 +164,7 @@
+ etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+
+- evalProps(etalon, props);
++ evalProps(etalon, props, true);
+ }
+ finally
+ {
+@@ -146,7 +192,7 @@
+ etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+
+- evalProps(etalon, props);
++ evalProps(etalon, props, true);
+ }
+ finally
+ {
+@@ -174,7 +220,7 @@
+ etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+
+- evalProps(etalon, props);
++ evalProps(etalon, props, true);
+ }
+ finally
+ {
+@@ -202,7 +248,7 @@
+ etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+
+- evalProps(etalon, props);
++ evalProps(etalon, props, true);
+ }
+ finally
+ {
+@@ -228,7 +274,7 @@
+ etalon.put(DCMetaData.CREATOR, "nikolaz ");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+
+- evalProps(etalon, props);
++ evalProps(etalon, props, true);
+ }
+ finally
+ {
+@@ -236,7 +282,7 @@
+ }
+ }
+
+- private void evalProps(Properties etalon, Properties testedProps)
++ private void evalProps(Properties etalon, Properties testedProps, boolean testSize)
+ {
+ Iterator it = etalon.entrySet().iterator();
+ while (it.hasNext())
+@@ -244,13 +290,12 @@
+ Map.Entry prop = (Map.Entry)it.next();
+ Object tval = testedProps.get(prop.getKey());
+ assertNotNull(prop.getKey() + " property not founded. ", tval);
+- if (tval instanceof Date)
+- {
+- System.out.println("was:" + ((Date)tval).getTime() + " expected: " + ((Date)prop.getValue()).getTime());
+- }
+ assertEquals(prop.getKey() + " property value is incorrect", prop.getValue(), tval);
+ }
+- assertEquals("size is incorrect", etalon.size(), testedProps.size());
++ if (testSize)
++ {
++ assertEquals("size is incorrect", etalon.size(), testedProps.size());
++ }
+ }
+
+ }
+Index: exo.core.component.document/src/test/resources/metro.pdf
+===================================================================
+Cannot display: file marked as a binary type.
+svn:mime-type = application/octet-stream
+
+Property changes on: exo.core.component.document\src\test\resources\metro.pdf
+___________________________________________________________________
+Added: svn:mime-type
+ + application/octet-stream
+
+Index: exo.core.component.document/src/test/resources/pfs_accapp.pdf
+===================================================================
+Cannot display: file marked as a binary type.
+svn:mime-type = application/octet-stream
+
+Property changes on: exo.core.component.document\src\test\resources\pfs_accapp.pdf
+___________________________________________________________________
+Added: svn:mime-type
+ + application/octet-stream
+