[exo-jcr-commits] exo-jcr SVN: r4480 - in core/trunk/exo.core.component.document/src: test/java/org/exoplatform/services/document/test and 1 other directories.

do-not-reply at jboss.org do-not-reply at jboss.org
Mon Jun 6 07:52:06 EDT 2011


Author: sergiykarpenko
Date: 2011-06-06 07:52:05 -0400 (Mon, 06 Jun 2011)
New Revision: 4480

Added:
   core/trunk/exo.core.component.document/src/test/resources/Trait_union.06.Mai_2009.pdf
Modified:
   core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java
   core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java
Log:
EXOJCR-1373: pdf documents metadata UTF-16 encoding support added

Modified: core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java
===================================================================
--- core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java	2011-06-06 10:00:20 UTC (rev 4479)
+++ core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PDFDocumentReader.java	2011-06-06 11:52:05 UTC (rev 4480)
@@ -37,6 +37,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
 import java.security.PrivilegedActionException;
 import java.security.PrivilegedExceptionAction;
 import java.util.Calendar;
@@ -204,7 +205,7 @@
                         try
                         {
                            if (dc.getTitle() != null)
-                              props.put(DCMetaData.TITLE, dc.getTitle());
+                              props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                         }
                         catch (Exception e)
                         {
@@ -213,7 +214,7 @@
                         try
                         {
                            if (dc.getDescription() != null)
-                              props.put(DCMetaData.DESCRIPTION, dc.getDescription());
+                              props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                         }
                         catch (Exception e)
                         {
@@ -226,7 +227,7 @@
                            {
                               for (String creator : dc.getCreators())
                               {
-                                 props.put(DCMetaData.CREATOR, creator);
+                                 props.put(DCMetaData.CREATOR, fixEncoding(creator));
                               }
                            }
                         }
@@ -257,7 +258,7 @@
                         try
                         {
                            if (pdf.getKeywords() != null)
-                              props.put(DCMetaData.SUBJECT, pdf.getKeywords());
+                              props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                         }
                         catch (Exception e)
                         {
@@ -267,7 +268,7 @@
                         try
                         {
                            if (pdf.getProducer() != null)
-                              props.put(DCMetaData.PUBLISHER, pdf.getProducer());
+                              props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                         }
                         catch (Exception e)
                         {
@@ -296,18 +297,12 @@
                         {
                            log.warn("getModificationDate failed: " + e);
                         }
-                        // try
-                        // {
-                        // if (basic.getCreatorTool() != null)
-                        // props.put(DCMetaData.PUBLISHER, basic.getCreatorTool());
-                        // }
-                        // catch (Exception e)
-                        // {
-                        // log.warn("getCreatorTool failed: " + e);
-                        // }
+
+                        // DCMetaData.PUBLISHER - basic.getCreatorTool()
                      }
                   }
-                  else
+
+                  if (props.isEmpty())
                   {
                      // The pdf doesn't contain any metadata, try to use the document
                      // information instead
@@ -434,4 +429,87 @@
       }
    }
 
+   private String fixEncoding(String str)
+   {
+      try
+      {
+         String encoding = null;
+         int orderMaskOffset = 0;
+
+         if (str.startsWith("\\000\\000\\376\\377"))
+         {
+            encoding = "UTF-32BE";
+            orderMaskOffset = 16;
+         }
+         else if (str.startsWith("\\377\\376\\000\\000"))
+         {
+            encoding = "UTF-32LE";
+            orderMaskOffset = 16;
+         }
+         else if (str.startsWith("\\376\\377"))
+         {
+            encoding = "UTF-16BE";
+            orderMaskOffset = 8;
+         }
+         else if (str.startsWith("\\377\\376"))
+         {
+            encoding = "UTF-16LE";
+            orderMaskOffset = 8;
+         }
+
+         if (encoding == null)
+         {
+            // return default
+            return str;
+         }
+         else
+         {
+            int i = orderMaskOffset, len = str.length();
+            char c;
+            StringBuilder sb = new StringBuilder(len);
+            while (i < len)
+            {
+               c = str.charAt(i++);
+               if (c == '\\')
+               {
+                  if (i + 3 <= len)
+                  {
+                     //extract octal-code
+                     try
+                     {
+                        c = (char)Integer.parseInt(str.substring(i, i + 3), 8);
+                        i += 3;
+                     }
+                     catch (NumberFormatException e)
+                     {
+                        if (log.isDebugEnabled())
+                        {
+                           log.debug(
+                              "PDF metadata exctraction warning: can not decode octal code - "
+                                 + str.substring(i - 1, i + 3) + ".", e);
+                        }
+                     }
+                  }
+                  else
+                  {
+                     if (log.isDebugEnabled())
+                     {
+                        log.debug("PDF metadata exctraction warning: octal code is not complete - "
+                           + str.substring(i - 1, len));
+                     }
+                  }
+               }
+               sb.append(c);
+            }
+
+            byte[] bytes = sb.toString().getBytes();
+            return new String(bytes, encoding);
+         }
+      }
+      catch (UnsupportedEncodingException e)
+      {
+         log.warn("PDF metadata exctraction warning: can not convert metadata string " + str, e);
+         return "";
+      }
+   }
 }

Modified: core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java
===================================================================
--- core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java	2011-06-06 10:00:20 UTC (rev 4479)
+++ core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestPropertiesExtracting.java	2011-06-06 11:52:05 UTC (rev 4480)
@@ -107,6 +107,24 @@
       }
    }
 
+   public void testPDFDocumentReaderServiceXMPUsecase1() throws Exception
+   {
+      InputStream is = TestPropertiesExtracting.class.getResourceAsStream("/Trait_union.06.Mai_2009.pdf");
+      try
+      {
+         DocumentReader rdr = service.getDocumentReader("application/pdf");
+         Properties testprops = rdr.getProperties(is);
+         Properties etalon = new Properties();
+         etalon.put(DCMetaData.TITLE, "journal interne mai 2009.qxp");
+         etalon.put(DCMetaData.CREATOR, "presse");
+         evalProps(etalon, testprops, false);
+      }
+      finally
+      {
+         is.close();
+      }
+   }
+
    public void testWordDocumentReaderService() throws Exception
    {
       InputStream is = TestPropertiesExtracting.class.getResourceAsStream("/test.doc");

Added: core/trunk/exo.core.component.document/src/test/resources/Trait_union.06.Mai_2009.pdf
===================================================================
(Binary files differ)


Property changes on: core/trunk/exo.core.component.document/src/test/resources/Trait_union.06.Mai_2009.pdf
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream



More information about the exo-jcr-commits mailing list