Author: dkuleshov
Date: 2010-12-21 03:56:24 -0500 (Tue, 21 Dec 2010)
New Revision: 3690
Added:
core/trunk/exo.core.component.document/src/test/resources/testEXCEL.xls
core/trunk/exo.core.component.document/src/test/resources/testEXCEL.xlsb
core/trunk/exo.core.component.document/src/test/resources/testPPT.potm
core/trunk/exo.core.component.document/src/test/resources/testPPT.ppsm
core/trunk/exo.core.component.document/src/test/resources/testPPT.ppsx
core/trunk/exo.core.component.document/src/test/resources/testPPT.pptm
core/trunk/exo.core.component.document/src/test/resources/testRTF.rtf
core/trunk/exo.core.component.document/src/test/resources/testWORD.docm
core/trunk/exo.core.component.document/src/test/resources/testWORD.dotm
core/trunk/exo.core.component.document/src/test/resources/testWORD.dotx
core/trunk/exo.core.component.document/src/test/resources/testXHTML.html
Modified:
core/trunk/exo.core.component.document/pom.xml
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/HTMLDocumentReader.java
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSExcelDocumentReader.java
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXExcelDocumentReader.java
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXPPTDocumentReader.java
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXWordDocumentReader.java
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PPTDocumentReader.java
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/TextPlainDocumentReader.java
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestHtmlDocumentReader.java
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSExcelDocumentReader.java
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXPPTDocumentReader.java
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXWordDocumentReader.java
Log:
EXOJCR-1114: provided support for more MIME types
Modified: core/trunk/exo.core.component.document/pom.xml
===================================================================
--- core/trunk/exo.core.component.document/pom.xml 2010-12-21 08:26:13 UTC (rev 3689)
+++ core/trunk/exo.core.component.document/pom.xml 2010-12-21 08:56:24 UTC (rev 3690)
@@ -129,6 +129,15 @@
<include>**/*.msg</include>
<include>**/*.pst</include>
<include>**/*.policy</include>
+ <include>**/*.rtf</include>
+ <include>**/*.dotx</include>
+ <include>**/*.docm</include>
+ <include>**/*.dotm</include>
+ <include>**/*.xlsb</include>
+ <include>**/*.pptm</include>
+ <include>**/*.ppsx</include>
+ <include>**/*.ppsm</include>
+ <include>**/*.potm</include>
</includes>
</testResource>
</testResources>
Modified:
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/HTMLDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/HTMLDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/HTMLDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -53,7 +53,7 @@
public HTMLDocumentReader()
{
}
-
+
/**
* Get the text/html,application/x-groovy+html mime type.
*
@@ -61,7 +61,7 @@
*/
public String[] getMimeTypes()
{
- return new String[]{"text/html", "application/x-groovy+html"};
+ return new String[]{"text/html", "application/x-groovy+html",
"application/xhtml+xml"};
}
/**
Modified:
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSExcelDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSExcelDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSExcelDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -53,7 +53,7 @@
*/
public String[] getMimeTypes()
{
- return new String[]{"application/excel", "application/xls"};
+ return new String[]{"application/excel", "application/xls",
"application/vnd.ms-excel"};
}
/**
Modified:
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXExcelDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXExcelDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXExcelDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -53,6 +53,15 @@
*/
public String[] getMimeTypes()
{
+ //Supported mimetypes:
+ // "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" -
"x.xlsx"
+ //
+ //Unsupported mimetypes:
+ // "application/vnd.ms-excel.sheet.binary.macroenabled.12" -
"*.xlsb"; There are exceptions at parsing
+ // "application/vnd.openxmlformats-officedocument.spreadsheetml.template"
- "x.xltx"; Not tested
+ // "application/vnd.ms-excel.sheet.macroenabled.12" - "x.xlsm";
Not tested
+ // "application/vnd.ms-excel.template.macroenabled.12" -
"x.xltm"; Not tested
+ // "application/vnd.ms-excel.addin.macroenabled.12" - "x.xlam";
Not tested
return new
String[]{"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"};
}
Modified:
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXPPTDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXPPTDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXPPTDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -51,7 +51,21 @@
*/
public String[] getMimeTypes()
{
- return new
String[]{"application/vnd.openxmlformats-officedocument.presentationml.presentation"};
+ //Supported mimetypes:
+ //
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
-"x.pptx";
+ //
"application/vnd.openxmlformats-officedocument.presentationml.slideshow" -
"x.ppsx";
+ // "application/vnd.ms-powerpoint.presentation.macroenabled.12" -
"testPPT.pptm";
+ // "application/vnd.ms-powerpoint.slideshow.macroenabled.12" -
"testPPT.ppsm";
+ //
+ //Not supported mimetypes:
+ // "application/vnd.ms-powerpoint.template.macroenabled.12" -
"testPPT.potm"; Has errors
+ //
"application/vnd.openxmlformats-officedocument.presentationml.template" -
"x.potx"; Not tested
+ // "application/vnd.ms-powerpoint.addin.macroenabled.12" -
"x.ppam"; Not tested
+
+ return new
String[]{"application/vnd.openxmlformats-officedocument.presentationml.presentation",
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+ "application/vnd.ms-powerpoint.presentation.macroenabled.12",
+ "application/vnd.ms-powerpoint.slideshow.macroenabled.12"};
}
/**
Modified:
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXWordDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXWordDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/MSXWordDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -47,7 +47,15 @@
*/
public String[] getMimeTypes()
{
- return new
String[]{"application/vnd.openxmlformats-officedocument.wordprocessingml.document"};
+ //Supported document types:
+ //
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" -
"x.docx"
+ //
"application/vnd.openxmlformats-officedocument.wordprocessingml.template" -
"x.dotx"
+ // "application/vnd.ms-word.document.macroenabled.12" -
"x.docm"
+ // "application/vnd.ms-word.template.macroenabled.12" -
"x.dotm"
+
+ return new
String[]{"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+ "application/vnd.ms-word.document.macroenabled.12",
"application/vnd.ms-word.template.macroenabled.12"};
}
/**
Modified:
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PPTDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PPTDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/PPTDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -43,7 +43,7 @@
*/
public String[] getMimeTypes()
{
- return new String[]{"application/powerpoint",
"application/ppt"};
+ return new String[]{"application/powerpoint",
"application/ppt", "application/vnd.ms-powerpoint"};
}
/**
Modified:
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/TextPlainDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/TextPlainDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/main/java/org/exoplatform/services/document/impl/TextPlainDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -67,10 +67,10 @@
*/
public String[] getMimeTypes()
{
- return new String[]{"text/plain","script/groovy",
-
"application/x-groovy","application/x-javascript",
-
"application/javascript","text/javascript",
- "application/x-jaxrs+groovy"};
+ return new String[]{"text/plain", "script/groovy",
"application/x-groovy", "application/x-javascript",
+ "application/javascript", "text/javascript",
"application/x-jaxrs+groovy"};
+ // "text/rtf", "application/rtf" excluded since there must be
an RTF parser - because plain text contains a lot of formatting tags.
+
}
/**
Modified:
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestHtmlDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestHtmlDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestHtmlDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -58,4 +58,20 @@
is.close();
}
}
+
+ public void testXHTMLGetContentAsString() throws Exception
+ {
+ InputStream is =
TestHtmlDocumentReader.class.getResourceAsStream("/testXHTML.html");
+ try
+ {
+ DocumentReader dr =
service.getDocumentReader("application/xhtml+xml");
+ String text = dr.getContentAsText(is);
+ assertTrue(text
+ .contains("This document tests the ability of Apache Tika to extract
content from an XHTML document."));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
}
Modified:
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSExcelDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSExcelDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSExcelDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -46,6 +46,24 @@
service.addDocumentReader(new MSExcelDocumentReader());
}
+ public void testClassicExcelGetContentAsString() throws Exception
+ {
+ InputStream is =
TestMSXExcelDocumentReader.class.getResourceAsStream("/testEXCEL.xls");
+ try
+ {
+ String content =
service.getDocumentReader("application/vnd.ms-excel").getContentAsText(is);
+ assertTrue(content.contains("Sample Excel Worksheet"));
+ assertTrue(content.contains("Numbers and their Squares"));
+ assertTrue(content.contains("Number"));
+ assertTrue(content.contains("9"));
+ assertFalse(content.contains("9.0"));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
public void testGetContentAsString() throws Exception
{
InputStream is =
TestMSExcelDocumentReader.class.getResourceAsStream("/test.xls");
Modified:
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXPPTDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXPPTDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXPPTDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -61,4 +61,72 @@
is.close();
}
}
+
+ public void testPPSXGetContentAsString() throws Exception
+ {
+ InputStream is =
TestMSXPPTDocumentReader.class.getResourceAsStream("/testPPT.ppsx");
+ try
+ {
+ String content =
+
service.getDocumentReader("application/vnd.openxmlformats-officedocument.presentationml.slideshow")
+ .getContentAsText(is);
+ assertTrue(content
+ .contains("This is a test file data with the same content as every other
file being tested for"));
+ assertTrue(content.contains("Different words to test against"));
+ assertTrue(content.contains("Quest"));
+ assertTrue(content.contains("Hello"));
+ assertTrue(content.contains("Watershed"));
+ assertTrue(content.contains("Avalanche"));
+ assertTrue(content.contains("Black Panther"));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
+ public void testPPTMGetContentAsString() throws Exception
+ {
+ InputStream is =
TestMSXPPTDocumentReader.class.getResourceAsStream("/testPPT.pptm");
+ try
+ {
+ String content =
+
service.getDocumentReader("application/vnd.ms-powerpoint.presentation.macroenabled.12")
+ .getContentAsText(is);
+ assertTrue(content
+ .contains("This is a test file data with the same content as every other
file being tested for"));
+ assertTrue(content.contains("Different words to test against"));
+ assertTrue(content.contains("Quest"));
+ assertTrue(content.contains("Hello"));
+ assertTrue(content.contains("Watershed"));
+ assertTrue(content.contains("Avalanche"));
+ assertTrue(content.contains("Black Panther"));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
+ public void testPPSMGetContentAsString() throws Exception
+ {
+ InputStream is =
TestMSXPPTDocumentReader.class.getResourceAsStream("/testPPT.ppsm");
+ try
+ {
+ String content =
+
service.getDocumentReader("application/vnd.ms-powerpoint.slideshow.macroenabled.12").getContentAsText(is);
+ assertTrue(content
+ .contains("This is a test file data with the same content as every other
file being tested for"));
+ assertTrue(content.contains("Different words to test against"));
+ assertTrue(content.contains("Quest"));
+ assertTrue(content.contains("Hello"));
+ assertTrue(content.contains("Watershed"));
+ assertTrue(content.contains("Avalanche"));
+ assertTrue(content.contains("Black Panther"));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
}
Modified:
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXWordDocumentReader.java
===================================================================
---
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXWordDocumentReader.java 2010-12-21
08:26:13 UTC (rev 3689)
+++
core/trunk/exo.core.component.document/src/test/java/org/exoplatform/services/document/test/TestMSXWordDocumentReader.java 2010-12-21
08:56:24 UTC (rev 3690)
@@ -40,7 +40,7 @@
service.addDocumentReader(new MSXWordDocumentReader());
}
- public void testGetContentAsStringDoc() throws Exception
+ public void testDOCXGetContentAsStringDoc() throws Exception
{
InputStream is =
TestMSXWordDocumentReader.class.getResourceAsStream("/test.docx");
try
@@ -56,4 +56,50 @@
is.close();
}
}
+
+ public void testDOTXGetContentAsStringDoc() throws Exception
+ {
+ InputStream is =
TestMSXWordDocumentReader.class.getResourceAsStream("/testWORD.dotx");
+ try
+ {
+ String text =
+
service.getDocumentReader("application/vnd.openxmlformats-officedocument.wordprocessingml.template")
+ .getContentAsText(is);
+ assertTrue(text.contains("template"));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
+ public void testDOCMGetContentAsStringDoc() throws Exception
+ {
+ InputStream is =
TestMSXWordDocumentReader.class.getResourceAsStream("/testWORD.docm");
+ try
+ {
+ String text =
+
service.getDocumentReader("application/vnd.ms-word.document.macroenabled.12").getContentAsText(is);
+ assertTrue(text.contains("template"));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+
+ public void testDOTMGetContentAsStringDoc() throws Exception
+ {
+ InputStream is =
TestMSXWordDocumentReader.class.getResourceAsStream("/testWORD.dotm");
+ try
+ {
+ String text =
+
service.getDocumentReader("application/vnd.ms-word.template.macroenabled.12").getContentAsText(is);
+ assertTrue(text.contains("Template with macros"));
+ }
+ finally
+ {
+ is.close();
+ }
+ }
}
Added: core/trunk/exo.core.component.document/src/test/resources/testEXCEL.xls
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testEXCEL.xls
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testEXCEL.xlsb
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testEXCEL.xlsb
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testPPT.potm
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testPPT.potm
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testPPT.ppsm
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testPPT.ppsm
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testPPT.ppsx
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testPPT.ppsx
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testPPT.pptm
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testPPT.pptm
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testRTF.rtf
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testRTF.rtf
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testWORD.docm
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testWORD.docm
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testWORD.dotm
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testWORD.dotm
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testWORD.dotx
===================================================================
(Binary files differ)
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testWORD.dotx
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Added: core/trunk/exo.core.component.document/src/test/resources/testXHTML.html
===================================================================
--- core/trunk/exo.core.component.document/src/test/resources/testXHTML.html
(rev 0)
+++ core/trunk/exo.core.component.document/src/test/resources/testXHTML.html 2010-12-21
08:56:24 UTC (rev 3690)
@@ -0,0 +1,29 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+
http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html
xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+ <title>XHTML test document</title>
+ <meta name="Author" content="Tika Developers"/>
+ <meta http-equiv="refresh" content="5"/>
+ </head>
+ <body>
+ <p>
+ This document tests the ability of Apache Tika to extract content
+ from an <a
href="http://www.w3.org/TR/xhtml1/">XHTML
document</a>.
+ </p>
+ </body>
+</html>
\ No newline at end of file
Property changes on:
core/trunk/exo.core.component.document/src/test/resources/testXHTML.html
___________________________________________________________________
Name: svn:eol-style
+ native