Author: paristote
Date: 2011-06-10 04:31:59 -0400 (Fri, 10 Jun 2011)
New Revision: 4495
Added:
core/branches/2.3.x/exo.core.component.document/src/test/resources/Trait_union.06.Mai_2009.pdf
core/branches/2.3.x/exo.core.component.document/src/test/resources/metro.pdf
core/branches/2.3.x/exo.core.component.document/src/test/resources/pfs_accapp.pdf
core/branches/2.3.x/patch/2.3.8/COR-228/readme.txt
Modified:
core/branches/2.3.x/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java
core/branches/2.3.x/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java
core/branches/2.3.x/packaging/module/src/main/javascript/core.packaging.module.js
Log:
COR-228
What is the problem to fix?
The iText-based implementation does not support many XMP metadata fields. Provide a new
implementation of PDFDocumentReader.getProperties() that uses PdfBox instead of iText.
How is the problem fixed?
Use PdfBox to extract XMP metadata.
iText was removed from code.
Modified:
core/branches/2.3.x/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java
===================================================================
---
core/branches/2.3.x/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java 2011-06-09
14:54:48 UTC (rev 4494)
+++
core/branches/2.3.x/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java 2011-06-10
08:31:59 UTC (rev 4495)
@@ -18,37 +18,28 @@
*/
package org.exoplatform.services.document.impl;
-import com.lowagie.text.pdf.PdfDate;
-import com.lowagie.text.pdf.PdfReader;
-
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchemaBasic;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.util.PDFTextStripper;
-import org.exoplatform.commons.utils.ISO8601;
import org.exoplatform.services.document.DCMetaData;
import org.exoplatform.services.document.DocumentReadException;
import org.exoplatform.services.log.ExoLogger;
import org.exoplatform.services.log.Log;
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
-import java.security.AccessController;
-import java.security.PrivilegedActionException;
-import java.security.PrivilegedExceptionAction;
-import java.text.ParseException;
+import java.io.UnsupportedEncodingException;
import java.util.Calendar;
-import java.util.HashMap;
+import java.util.List;
import java.util.Properties;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
/**
* Created by The eXo Platform SAS A parser of Adobe PDF files.
*
@@ -79,79 +70,51 @@
*/
public String getContentAsText(final InputStream is) throws IOException,
DocumentReadException
{
-
+ if (is == null)
+ {
+ throw new NullPointerException("InputStream is null.");
+ }
+ PDDocument pdDocument = null;
+ StringWriter sw = new StringWriter();
try
{
- return (String)AccessController.doPrivileged(new
PrivilegedExceptionAction<Object>()
- {
- public Object run() throws Exception
- {
- if (is == null)
- {
- throw new NullPointerException("InputStream is null.");
- }
- PDDocument pdDocument = null;
- StringWriter sw = new StringWriter();
- try
- {
- if (is.available() == 0)
- return "";
+ if (is.available() == 0)
+ return "";
- try
- {
- pdDocument = PDDocument.load(is);
- }
- catch (IOException e)
- {
- throw new DocumentReadException("Can not load PDF
document.", e);
- }
-
- PDFTextStripper stripper = new PDFTextStripper();
- stripper.setStartPage(1);
- stripper.setEndPage(Integer.MAX_VALUE);
- stripper.writeText(pdDocument, sw);
- }
- finally
- {
- if (pdDocument != null)
- try
- {
- pdDocument.close();
- }
- catch (IOException e)
- {
- }
- if (is != null)
- try
- {
- is.close();
- }
- catch (IOException e)
- {
- }
- }
- return sw.toString();
- }
- });
-
- }
- catch (PrivilegedActionException pae)
- {
- Throwable cause = pae.getCause();
- if (cause instanceof IOException)
+ try
{
- throw (IOException)cause;
+ pdDocument = PDDocument.load(is);
}
- else if (cause instanceof RuntimeException)
+ catch (IOException e)
{
- throw (RuntimeException)cause;
+ throw new DocumentReadException("Can not load PDF document.", e);
}
- else
- {
- throw new RuntimeException(cause);
- }
+
+ PDFTextStripper stripper = new PDFTextStripper();
+ stripper.setStartPage(1);
+ stripper.setEndPage(Integer.MAX_VALUE);
+ stripper.writeText(pdDocument, sw);
}
-
+ finally
+ {
+ if (pdDocument != null)
+ try
+ {
+ pdDocument.close();
+ }
+ catch (IOException e)
+ {
+ }
+ if (is != null)
+ try
+ {
+ is.close();
+ }
+ catch (IOException e)
+ {
+ }
+ }
+ return sw.toString();
}
public String getContentAsText(InputStream is, String encoding) throws IOException,
DocumentReadException
@@ -168,204 +131,269 @@
*/
public Properties getProperties(InputStream is) throws IOException,
DocumentReadException
{
-
- Properties props = null;
-
- PdfReader reader = new PdfReader(is, "".getBytes());
-
- // Read the file metadata
- byte[] metadata = reader.getMetadata();
-
- if (metadata != null)
+ PDDocument pdDocument = PDDocument.load(is);
+ Properties props = new Properties();
+ try
{
- // there is XMP metadata try exctract it
- props = getPropertiesFromMetadata(metadata);
- }
-
- if (props == null)
- {
- // it's old pdf document version
- props = getPropertiesFromInfo(reader.getInfo());
- }
- reader.close();
- if (is != null)
- try
+ if (pdDocument.isEncrypted())
{
- is.close();
+ try
+ {
+ pdDocument.decrypt("");
+ }
+ catch (InvalidPasswordException e)
+ {
+ throw new DocumentReadException("The pdf document is
encrypted.", e);
+ }
+ catch (org.apache.pdfbox.exceptions.CryptographyException e)
+ {
+ throw new DocumentReadException(e.getMessage(), e);
+ }
}
- catch (IOException e)
+
+ PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
+ PDMetadata meta = catalog.getMetadata();
+ if (meta != null)
{
- }
- return props;
- }
+ XMPMetadata metadata = meta.exportXMPMetadata();
- /**
- * Extract properties from XMP xml.
- *
- * @param metadata XML as byte array
- * @return extracted properties
- * @throws DocumentReadException
- * @throws Exception if extracting fails
- */
- protected Properties getPropertiesFromMetadata(byte[] metadata) throws IOException,
DocumentReadException
- {
+ XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
+ if (dc != null)
+ {
+ try
+ {
+ if (dc.getTitle() != null)
+ props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
+ }
+ catch (Exception e)
+ {
+ log.warn("getTitle failed: " + e);
+ }
+ try
+ {
+ if (dc.getDescription() != null)
+ props.put(DCMetaData.SUBJECT, fixEncoding(dc.getDescription()));
+ }
+ catch (Exception e)
+ {
+ log.warn("getSubject failed: " + e);
+ }
- Properties props = null;
+ try
+ {
+ if (dc.getCreators() != null)
+ {
+ List<String> list = dc.getCreators();
+ for (String creator : list)
+ {
+ props.put(DCMetaData.CREATOR, fixEncoding(creator));
+ }
+ }
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreator failed: " + e);
+ }
- // parse xml
+ try
+ {
+ if (dc.getDates() != null)
+ {
+ List<Calendar> list = dc.getDates();
- Document doc;
- try
- {
- DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
- DocumentBuilder docBuilder = dbf.newDocumentBuilder();
- doc = docBuilder.parse(new ByteArrayInputStream(metadata));
- }
- catch (SAXException e)
- {
- throw new DocumentReadException(e.getMessage(), e);
- }
- catch (ParserConfigurationException e)
- {
- throw new DocumentReadException(e.getMessage(), e);
- }
+ for (Calendar date : list)
+ {
+ props.put(DCMetaData.DATE, date);
+ }
+ }
+ }
+ catch (Exception e)
+ {
+ log.warn("getDate failed: " + e);
+ }
+ }
- // Check is there PDF/A-1 XMP
- String version = "";
- NodeList list = doc.getElementsByTagName("pdfaid:conformance");
- if (list != null && list.item(0) != null)
- {
- version += list.item(0).getTextContent() + "-";
- }
+ XMPSchemaBasic basic = metadata.getBasicSchema();
+ if (basic != null)
+ {
+ try
+ {
+ if (basic.getCreateDate() != null)
+ props.put(DCMetaData.DATE, basic.getCreateDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreationDate failed: " + e);
+ }
+ try
+ {
+ if (basic.getModifyDate() != null)
+ props.put(DCMetaData.DATE, basic.getModifyDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getModificationDate failed: " + e);
+ }
+ }
+ }
- list = doc.getElementsByTagName("pdfaid:part");
- if (list != null && list.item(0) != null)
- {
- version += list.item(0).getTextContent();
- }
+ if (props.isEmpty())
+ {
+ // The pdf doesn't contain any XMP metadata or XMP metadata do not
contains any
+ // usefull data, try to use the document information instead
+ PDDocumentInformation docInfo = pdDocument.getDocumentInformation();
- // PDF/A-1a or PDF/A-1b
- if (version.equalsIgnoreCase("A-1"))
- {
- props = getPropsFromPDFAMetadata(doc);
- }
+ if (docInfo != null)
+ {
+ try
+ {
+ if (docInfo.getCreationDate() != null)
+ props.put(DCMetaData.DATE, docInfo.getCreationDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreationDate failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getCreator() != null)
+ props.put(DCMetaData.CREATOR, docInfo.getCreator());
+ }
+ catch (Exception e)
+ {
+ log.warn("getCreator failed: " + e);
+ }
+ try
+ {
- return props;
- }
-
- /**
- * Extracts properties from PDF Info hash set.
- *
- * @param Pdf Info hash set
- * @return Extracted properties
- * @throws Exception if extracting fails
- */
- @SuppressWarnings("unchecked")
- protected Properties getPropertiesFromInfo(HashMap info) throws IOException
- {
- Properties props = new Properties();
-
- String title = (String)info.get("Title");
- if (title != null)
- {
- props.put(DCMetaData.TITLE, title);
+ if (docInfo.getKeywords() != null)
+ props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
+ }
+ catch (Exception e)
+ {
+ log.warn("getKeywords failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getModificationDate() != null)
+ props.put(DCMetaData.DATE, docInfo.getModificationDate());
+ }
+ catch (Exception e)
+ {
+ log.warn("getModificationDate failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getSubject() != null)
+ props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
+ }
+ catch (Exception e)
+ {
+ log.warn("getSubject failed: " + e);
+ }
+ try
+ {
+ if (docInfo.getTitle() != null)
+ props.put(DCMetaData.TITLE, docInfo.getTitle());
+ }
+ catch (Exception e)
+ {
+ log.warn("getTitle failed: " + e);
+ }
+ }
+ }
}
-
- String author = (String)info.get("Author");
- if (author != null)
+ finally
{
- props.put(DCMetaData.CREATOR, author);
+ if (pdDocument != null)
+ {
+ pdDocument.close();
+ }
}
- String subject = (String)info.get("Subject");
- if (subject != null)
- {
- props.put(DCMetaData.SUBJECT, subject);
- }
-
- String creationDate = (String)info.get("CreationDate");
- if (creationDate != null)
- {
- props.put(DCMetaData.DATE, PdfDate.decode(creationDate));
- }
-
- String modDate = (String)info.get("ModDate");
- if (modDate != null)
- {
- props.put(DCMetaData.DATE, PdfDate.decode(modDate));
- }
-
return props;
}
- private Properties getPropsFromPDFAMetadata(Document doc) throws IOException,
DocumentReadException
+ private String fixEncoding(String str) throws DocumentReadException
{
- Properties props = new Properties();
- // get properties
- NodeList list = doc.getElementsByTagName("rdf:li");
- if (list != null && list.getLength() > 0)
+ try
{
- for (int i = 0; i < list.getLength(); i++)
+ String encoding = null;
+ int orderMaskOffset = 0;
+
+ if (str.startsWith("\\000\\000\\376\\377"))
{
-
- Node n = list.item(i);
- // dc:title - TITLE
- if
(n.getParentNode().getParentNode().getNodeName().equals("dc:title"))
- {
- String title = n.getLastChild().getTextContent();
- props.put(DCMetaData.TITLE, title);
- }
-
- // dc:creator - CREATOR
- if
(n.getParentNode().getParentNode().getNodeName().equals("dc:creator"))
- {
- String author = n.getLastChild().getTextContent();
- props.put(DCMetaData.CREATOR, author);
- }
-
- // DC:description - SUBJECT
- if
(n.getParentNode().getParentNode().getNodeName().equals("dc:description"))
- {
- String description = n.getLastChild().getTextContent();
- props.put(DCMetaData.SUBJECT, description);
- // props.put(DCMetaData.DESCRIPTION, description);
- }
+ encoding = "UTF-32BE";
+ orderMaskOffset = 16;
}
- }
-
- try
- {
- // xmp:CreateDate - DATE
- list = doc.getElementsByTagName("xmp:CreateDate");
- if (list != null && list.item(0) != null)
+ else if (str.startsWith("\\377\\376\\000\\000"))
{
- Node creationDateNode = list.item(0).getLastChild();
- if (creationDateNode != null)
- {
- String creationDate = creationDateNode.getTextContent();
- Calendar c = ISO8601.parseEx(creationDate);
- props.put(DCMetaData.DATE, c);
- }
+ encoding = "UTF-32LE";
+ orderMaskOffset = 16;
}
+ else if (str.startsWith("\\376\\377"))
+ {
+ encoding = "UTF-16BE";
+ orderMaskOffset = 8;
+ }
+ else if (str.startsWith("\\377\\376"))
+ {
+ encoding = "UTF-16LE";
+ orderMaskOffset = 8;
+ }
- // xmp:ModifyDate - DATE
- list = doc.getElementsByTagName("xmp:ModifyDate");
- if (list != null && list.item(0) != null)
+ if (encoding == null)
{
- Node modifyDateNode = list.item(0).getLastChild();
- if (modifyDateNode != null)
+ // return default
+ return str;
+ }
+ else
+ {
+ int i = orderMaskOffset, len = str.length();
+ char c;
+ StringBuilder sb = new StringBuilder(len);
+ while (i < len)
{
- String modifyDate = modifyDateNode.getTextContent();
- Calendar c = ISO8601.parseEx(modifyDate);
- props.put(DCMetaData.DATE, c);
+ c = str.charAt(i++);
+ if (c == '\\')
+ {
+ if (i + 3 <= len)
+ {
+ //extract octal-code
+ try
+ {
+ c = (char)Integer.parseInt(str.substring(i, i + 3), 8);
+ i += 3;
+ }
+ catch (NumberFormatException e)
+ {
+ if (log.isDebugEnabled())
+ {
+ log.debug(
+ "PDF metadata exctraction warning: can not decode
octal code - "
+ + str.substring(i - 1, i + 3) + ".", e);
+ }
+ }
+ }
+ else
+ {
+ if (log.isDebugEnabled())
+ {
+ log.debug("PDF metadata exctraction warning: octal code is
not complete - "
+ + str.substring(i - 1, len));
+ }
+ }
+ }
+ sb.append(c);
}
+
+ byte[] bytes = sb.toString().getBytes();
+ return new String(bytes, encoding);
}
}
- catch (ParseException e)
+ catch (UnsupportedEncodingException e)
{
throw new DocumentReadException(e.getMessage(), e);
}
- return props;
}
-
}
Modified:
core/branches/2.3.x/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java
===================================================================
---
core/branches/2.3.x/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java 2011-06-09
14:54:48 UTC (rev 4494)
+++
core/branches/2.3.x/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java 2011-06-10
08:31:59 UTC (rev 4495)
@@ -18,7 +18,6 @@
*/
package org.exoplatform.services.document.test;
-import org.exoplatform.commons.utils.ISO8601;
import org.exoplatform.services.document.DCMetaData;
import org.exoplatform.services.document.DocumentReader;
import org.exoplatform.services.document.DocumentReaderService;
@@ -34,6 +33,22 @@
{
DocumentReaderService service;
+ private void evalProps(Properties etalon, Properties testedProps, boolean testSize)
+ {
+ Iterator it = etalon.entrySet().iterator();
+ while (it.hasNext())
+ {
+ Map.Entry prop = (Map.Entry)it.next();
+ Object tval = testedProps.get(prop.getKey());
+ assertNotNull(prop.getKey() + " property not founded. ", tval);
+ assertEquals(prop.getKey() + " property value is incorrect",
prop.getValue(), tval);
+ }
+ if (testSize)
+ {
+ assertEquals("size is incorrect", etalon.size(), testedProps.size());
+ }
+ }
+
@Override
public void setUp() throws Exception
{
@@ -41,20 +56,21 @@
service =
(DocumentReaderService)getComponentInstanceOfType(DocumentReaderService.class);
}
- public void testPDFDocumentReaderServiceXMPMetadata() throws Exception
+ public void testExcelDocumentReaderService() throws Exception
{
- InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/MyTest.pdf");
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.xls");
try
{
- DocumentReader rdr = service.getDocumentReader("application/pdf");
- Properties testprops = rdr.getProperties(is);
+ Properties props =
service.getDocumentReader("application/excel").getProperties(is);
Properties etalon = new Properties();
- etalon.put(DCMetaData.TITLE, "Test de convertion de fichier tif");
- etalon.put(DCMetaData.CREATOR, "Christian Klaus");
- etalon.put(DCMetaData.SUBJECT, "20080901 TEST Christian Etat OK");
- Calendar c = ISO8601.parseEx("2008-09-01T08:01:10+00:00");
- etalon.put(DCMetaData.DATE, c);
- evalProps(etalon, testprops);
+ etalon.put(DCMetaData.TITLE, "test-Title");
+ etalon.put(DCMetaData.DATE, new Date(1283247293000L));
+ etalon.put(DCMetaData.SUBJECT, "test-Subject");
+ etalon.put(DCMetaData.CREATOR, "KHANH NGUYEN GIA");
+ etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
+
+ evalProps(etalon, props, true);
}
finally
{
@@ -62,20 +78,25 @@
}
}
- public void testWordDocumentReaderService() throws Exception
+ public void testOODocumentReaderService() throws Exception
{
- InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.doc");
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.odt");
try
{
- Properties props =
service.getDocumentReader("application/msword").getProperties(is);
+ Properties props =
service.getDocumentReader("application/vnd.oasis.opendocument.text").getProperties(is);
Properties etalon = new Properties();
+ Calendar date = Calendar.getInstance();
+ date.setTimeInMillis(0);
+ date.set(2010, 7, 31, 14, 13, 23);
+
etalon.put(DCMetaData.TITLE, "test-Title");
- etalon.put(DCMetaData.DATE, new Date(1283247060000L));
+ etalon.put(DCMetaData.LANGUAGE, "ru-RU");
+ etalon.put(DCMetaData.DATE, "2010-08-31T14:13:23");
etalon.put(DCMetaData.SUBJECT, "test-Subject");
- etalon.put(DCMetaData.CREATOR, "Max Yakimenko");
- etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
+ etalon.put(DCMetaData.CREATOR, "nikolaz ");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+
+ evalProps(etalon, props, true);
}
finally
{
@@ -83,6 +104,92 @@
}
}
+ public void testPDFDocumentReaderService() throws Exception
+ {
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.pdf");
+ DocumentReader rdr = service.getDocumentReader("application/pdf");
+ Properties props = rdr.getProperties(is);
+ assertTrue(props.isEmpty());
+
+ // Properties etalon = new Properties();
+ // etalon.put(DCMetaData.PUBLISHER, "FOP 0.20.4");
+ // evalProps(etalon, props, false);
+ }
+
+ public void testPDFDocumentReaderServiceBrokenFile() throws Exception
+ {
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/pfs_accapp.pdf");
+ try
+ {
+
+ DocumentReader rdr = service.getDocumentReader("application/pdf");
+ Properties testprops = rdr.getProperties(is);
+ Properties etalon = new Properties();
+ etalon.put(DCMetaData.TITLE, "Personal Account Opening Form VN");
+ etalon.put(DCMetaData.CREATOR, "mr");
+ evalProps(etalon, testprops, false);
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
+ public void testPDFDocumentReaderServiceMetro() throws Exception
+ {
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/metro.pdf");
+ try
+ {
+
+ DocumentReader rdr = service.getDocumentReader("application/pdf");
+ Properties testprops = rdr.getProperties(is);
+ Properties etalon = new Properties();
+ etalon.put(DCMetaData.TITLE, "metro");
+ etalon.put(DCMetaData.CREATOR, "Preview");
+ evalProps(etalon, testprops, false);
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
+ public void testPDFDocumentReaderServiceXMPUsecase1() throws Exception
+ {
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/MyTest.pdf");
+ DocumentReader rdr = service.getDocumentReader("application/pdf");
+
+ Properties testprops = rdr.getProperties(is);
+ // printProps(testprops);
+
+ Properties etalon = new Properties();
+ etalon.put(DCMetaData.TITLE, "Test de convertion de fichier tif");
+ etalon.put(DCMetaData.CREATOR, "Christian Klaus");
+ etalon.put(DCMetaData.SUBJECT, "20080901 TEST Christian Etat OK");
+ // Calendar c = ISO8601.parseEx("2008-09-01T08:01:10+00:00");
+ // etalon.put(DCMetaData.DATE, c);
+
+ evalProps(etalon, testprops, false);
+ }
+
+ public void testPDFDocumentReaderServiceXMPUsecase2() throws Exception
+ {
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/Trait_union.06.Mai_2009.pdf");
+ try
+ {
+ DocumentReader rdr = service.getDocumentReader("application/pdf");
+ Properties testprops = rdr.getProperties(is);
+ Properties etalon = new Properties();
+ etalon.put(DCMetaData.TITLE, "journal interne mai 2009.qxp");
+ etalon.put(DCMetaData.CREATOR, "presse");
+ evalProps(etalon, testprops, false);
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
public void testPPTDocumentReaderService() throws Exception
{
InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.ppt");
@@ -96,7 +203,7 @@
etalon.put(DCMetaData.CREATOR, "Max Yakimenko");
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -104,21 +211,20 @@
}
}
- public void testExcelDocumentReaderService() throws Exception
+ public void testWordDocumentReaderService() throws Exception
{
- InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.xls");
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.doc");
try
{
- Properties props =
service.getDocumentReader("application/excel").getProperties(is);
+ Properties props =
service.getDocumentReader("application/msword").getProperties(is);
Properties etalon = new Properties();
etalon.put(DCMetaData.TITLE, "test-Title");
- etalon.put(DCMetaData.DATE, new Date(1283247293000L));
+ etalon.put(DCMetaData.DATE, new Date(1283247060000L));
etalon.put(DCMetaData.SUBJECT, "test-Subject");
- etalon.put(DCMetaData.CREATOR, "KHANH NGUYEN GIA");
+ etalon.put(DCMetaData.CREATOR, "Max Yakimenko");
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
-
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -126,27 +232,27 @@
}
}
- public void testXWordDocumentReaderService() throws Exception
+ public void testXExcelDocumentReaderService() throws Exception
{
- InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.docx");
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.xlsx");
try
{
Properties props =
-
service.getDocumentReader("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+
service.getDocumentReader("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
.getProperties(is);
Properties etalon = new Properties();
Calendar date = Calendar.getInstance();
date.setTimeInMillis(0);
- date.set(2010, 7, 31, 7, 53, 0);
+ date.set(2010, 7, 31, 8, 7, 25);
etalon.put(DCMetaData.TITLE, "test-Title");
etalon.put(DCMetaData.DATE, date.getTime());
- etalon.put(DCMetaData.SUBJECT, "Subject");
- etalon.put(DCMetaData.CREATOR, "nikolaz");
+ etalon.put(DCMetaData.SUBJECT, "test-Subject");
+ etalon.put(DCMetaData.CREATOR, "KHANH NGUYEN GIA");
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -174,7 +280,7 @@
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -182,27 +288,27 @@
}
}
- public void testXExcelDocumentReaderService() throws Exception
+ public void testXWordDocumentReaderService() throws Exception
{
- InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.xlsx");
+ InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.docx");
try
{
Properties props =
-
service.getDocumentReader("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+
service.getDocumentReader("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
.getProperties(is);
Properties etalon = new Properties();
Calendar date = Calendar.getInstance();
date.setTimeInMillis(0);
- date.set(2010, 7, 31, 8, 7, 25);
+ date.set(2010, 7, 31, 7, 53, 0);
etalon.put(DCMetaData.TITLE, "test-Title");
etalon.put(DCMetaData.DATE, date.getTime());
- etalon.put(DCMetaData.SUBJECT, "test-Subject");
- etalon.put(DCMetaData.CREATOR, "KHANH NGUYEN GIA");
+ etalon.put(DCMetaData.SUBJECT, "Subject");
+ etalon.put(DCMetaData.CREATOR, "nikolaz");
etalon.put(DCMetaData.CONTRIBUTOR, "Max Yakimenko");
etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
- evalProps(etalon, props);
+ evalProps(etalon, props, true);
}
finally
{
@@ -210,47 +316,4 @@
}
}
- public void testOODocumentReaderService() throws Exception
- {
- InputStream is =
TestPropertiesExtracting.class.getResourceAsStream("/test.odt");
- try
- {
- Properties props =
service.getDocumentReader("application/vnd.oasis.opendocument.text").getProperties(is);
- Properties etalon = new Properties();
- Calendar date = Calendar.getInstance();
- date.setTimeInMillis(0);
- date.set(2010, 7, 31, 14, 13, 23);
-
- etalon.put(DCMetaData.TITLE, "test-Title");
- etalon.put(DCMetaData.LANGUAGE, "ru-RU");
- etalon.put(DCMetaData.DATE, "2010-08-31T14:13:23");
- etalon.put(DCMetaData.SUBJECT, "test-Subject");
- etalon.put(DCMetaData.CREATOR, "nikolaz ");
- etalon.put(DCMetaData.DESCRIPTION, "test-Comments");
-
- evalProps(etalon, props);
- }
- finally
- {
- is.close();
- }
- }
-
- private void evalProps(Properties etalon, Properties testedProps)
- {
- Iterator it = etalon.entrySet().iterator();
- while (it.hasNext())
- {
- Map.Entry prop = (Map.Entry)it.next();
- Object tval = testedProps.get(prop.getKey());
- assertNotNull(prop.getKey() + " property not founded. ", tval);
- if (tval instanceof Date)
- {
- System.out.println("was:" + ((Date)tval).getTime() + "
expected: " + ((Date)prop.getValue()).getTime());
- }
- assertEquals(prop.getKey() + " property value is incorrect",
prop.getValue(), tval);
- }
- assertEquals("size is incorrect", etalon.size(), testedProps.size());
- }
-
}
Added:
core/branches/2.3.x/exo.core.component.document/src/test/resources/Trait_union.06.Mai_2009.pdf
===================================================================
(Binary files differ)
Property changes on:
core/branches/2.3.x/exo.core.component.document/src/test/resources/Trait_union.06.Mai_2009.pdf
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Added: core/branches/2.3.x/exo.core.component.document/src/test/resources/metro.pdf
===================================================================
(Binary files differ)
Property changes on:
core/branches/2.3.x/exo.core.component.document/src/test/resources/metro.pdf
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Added: core/branches/2.3.x/exo.core.component.document/src/test/resources/pfs_accapp.pdf
===================================================================
(Binary files differ)
Property changes on:
core/branches/2.3.x/exo.core.component.document/src/test/resources/pfs_accapp.pdf
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Modified:
core/branches/2.3.x/packaging/module/src/main/javascript/core.packaging.module.js
===================================================================
---
core/branches/2.3.x/packaging/module/src/main/javascript/core.packaging.module.js 2011-06-09
14:54:48 UTC (rev 4494)
+++
core/branches/2.3.x/packaging/module/src/main/javascript/core.packaging.module.js 2011-06-10
08:31:59 UTC (rev 4495)
@@ -39,6 +39,7 @@
module.component.documents =
new Project("org.exoplatform.core",
"exo.core.component.document", "jar", module.version).
addDependency(new Project("org.apache.pdfbox", "pdfbox",
"jar", "1.1.0")).
+ addDependency(new Project("org.apache.pdfbox", "jempbox",
"jar", "1.1.0")).
addDependency(new Project("com.lowagie", "itext",
"jar", "2.1.0")).
addDependency(new Project("bouncycastle", "bcmail-jdk14",
"jar", "136")).
addDependency(new Project("bouncycastle", "bcprov-jdk14",
"jar", "136")).
Added: core/branches/2.3.x/patch/2.3.8/COR-228/readme.txt
===================================================================
--- core/branches/2.3.x/patch/2.3.8/COR-228/readme.txt (rev 0)
+++ core/branches/2.3.x/patch/2.3.8/COR-228/readme.txt 2011-06-10 08:31:59 UTC (rev 4495)
@@ -0,0 +1,71 @@
+Summary
+
+ Status: Make PdfDocumentReader.getProperties() use PdfBox instead of iText
+ CCP Issue: N/A, Product Jira Issue: COR-228.
+ Complexity: hard
+
+The Proposal
+Problem description
+
+What is the problem to fix?
+
+ The iText-based implementation does not support many XMP metadata fields. Provide a new
implementation of PDFDocumentReader.getProperties() that uses PdfBox instead of iText.
+
+Fix description
+
+How is the problem fixed?
+
+ Use PdfBox to extract XMP metadata.
+ iText was removed from code.
+
+Patch information:
+Patch file(s): COR-228.patch
+
+Tests to perform:
+Test performed at Support level
+Test on 3 PDF files (metro.pdf, pfs_accapp.pdf, Trait_union.06.Mai_2009.pdf) using
WebDAV or the ECMS Content Explorer (in a Content folder):
+
+ Upload file from local to eXo PLF server
+ Copy-paste file inside JCR folders
+ Check how its name and title display
+
+Tests performed at DevLevel
+
+ Add these 3 PDF files into src/test/resources
+ Run TestPropertiesExtracting and the other core.document tests
+
+Tests performed at QA
+*
+
+Documentation changes
+
+Documentation changes:
+ none
+
+Configuration changes
+
+Configuration changes:
+ none
+
+Will previous configuration continue to work?
+ yes
+
+Risks and impacts
+
+Can this bug fix have any side effects on current client projects?
+ The behavior of PDFDocumentReader.getProperties() based on PdfBox may differ from
the previous iText-based behavior
+
+Is there a performance risk/cost?
+ none
+
+Validation (PM/Support/QA)
+
+PM Comment
+* Patch approved by the PL
+
+Support Comment
+* Support review : patch validated
+
+QA Feedbacks
+*
+