Author: rhauch
Date: 2009-02-23 13:53:59 -0500 (Mon, 23 Feb 2009)
New Revision: 736
Added:
trunk/extensions/dna-sequencer-msoffice/src/test/java/org/jboss/dna/sequencer/msoffice/word/
trunk/extensions/dna-sequencer-msoffice/src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java
Modified:
trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/MSOfficeMetadataSequencer.java
trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadata.java
trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataReader.java
trunk/extensions/dna-sequencer-msoffice/src/test/resources/word.doc
Log:
DNA-153 Add support for Microsoft Word document metadata
Applied the "Extract_Word_Headers" patch and copied the new "word.doc"
file into the "src/test/resources" folder (see attachments). Looks good; all
tests now pass.
Modified:
trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/MSOfficeMetadataSequencer.java
===================================================================
--- trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/MSOfficeMetadataSequencer.java 2009-02-23 18:26:27 UTC (rev 735)
+++ trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/MSOfficeMetadataSequencer.java 2009-02-23 18:53:59 UTC (rev 736)
@@ -25,6 +25,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.Iterator;
import java.util.List;
import org.jboss.dna.graph.sequencer.SequencerContext;
import org.jboss.dna.graph.sequencer.SequencerOutput;
@@ -33,6 +34,7 @@
import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.SlideMetadata;
+import org.jboss.dna.sequencer.msoffice.word.WordMetadata;
import org.jboss.dna.sequencer.msoffice.word.WordMetadataReader;
/**
@@ -110,6 +112,11 @@
// Excel specific
public static final String EXCEL_FULL_CONTENT = "msoffice:full_contents";
public static final String EXCEL_SHEET_NAME = "msoffice:sheet_name";
+
+ // Word specific
+ public static final String WORD_HEADING_NODE = "msoffice:heading";
+ public static final String WORD_HEADING_NAME = "msoffice:heading_name";
+ public static final String WORD_HEADING_LEVEL = "msoffice:heading_level";
/**
* {@inheritDoc}
@@ -167,7 +174,16 @@
if (mimeType.equals("application/vnd.ms-word")) {
// Sometime in the future this will sequence WORD Table of contents.
try {
- /*WordMetadata wordMetadata =*/WordMetadataReader.invoke(stream);
+ WordMetadata wordMetadata = WordMetadataReader.instance(stream);
+
+ for (Iterator<WordMetadata.WordHeading> iter = wordMetadata.getHeadings().iterator(); iter.hasNext(); ) {
+ WordMetadata.WordHeading heading = iter.next();
+
+ output.setProperty(METADATA_NODE + "/" + WORD_HEADING_NODE, WORD_HEADING_NAME, heading.getText());
+ output.setProperty(METADATA_NODE + "/" + WORD_HEADING_NODE, WORD_HEADING_LEVEL, heading.getHeaderLevel());
+
+ }
+
} catch (IOException e) {
// There was an error reading, so log and continue ...
context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
Modified:
trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadata.java
===================================================================
--- trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadata.java 2009-02-23 18:26:27 UTC (rev 735)
+++ trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadata.java 2009-02-23 18:53:59 UTC (rev 736)
@@ -24,10 +24,49 @@
package org.jboss.dna.sequencer.msoffice.word;
+import java.util.List;
+
/**
* @author Michael Trezzi
*/
public class WordMetadata {
+ private List<WordMetadata.WordHeading> headings;
+ public List<WordMetadata.WordHeading> getHeadings() {
+ return headings;
+ }
+
+ public void setHeadings(List<WordMetadata.WordHeading> headings) {
+ this.headings = headings;
+ }
+
+
+ public static class WordHeading {
+ private String text;
+ private int headingLevel;
+
+ public WordHeading(String text, int headerLevel) {
+ super();
+ this.text = text;
+ this.headingLevel = headerLevel;
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ public void setText(String text) {
+ this.text = text;
+ }
+
+ public int getHeaderLevel() {
+ return headingLevel;
+ }
+
+ public void setHeaderLevel(int headerLevel) {
+ this.headingLevel = headerLevel;
+ }
+
+ }
}
Modified:
trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataReader.java
===================================================================
--- trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataReader.java 2009-02-23 18:26:27 UTC (rev 735)
+++ trunk/extensions/dna-sequencer-msoffice/src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataReader.java 2009-02-23 18:53:59 UTC (rev 736)
@@ -26,22 +26,65 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.jboss.dna.common.util.Logger;
/**
- * Reades table of contents from Word document
+ * Infers table of contents from Word document by reading all paragraphs
+ * with style <code>Heading*</code>. This is analogous to the default
+ * behavior of Word when generating a table of contents.
*
* @author Michael Trezzi
*/
public class WordMetadataReader {
- @SuppressWarnings( "null" )
- public static WordMetadata invoke( InputStream stream ) throws IOException {
- WordMetadata metadata = new WordMetadata();
+ private static final Logger log = Logger.getLogger(WordMetadataReader.class);
+
+ /** Prefix for styles that will be extracted and treated as outline information for the document */
+ private static final String HEADER_PREFIX = "Heading";
+
+ public static WordMetadata instance( InputStream stream ) throws IOException {
+ WordMetadata metadata = new WordMetadata();
+ List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();
+
HWPFDocument document = new HWPFDocument(stream);
- if (document != null) {
- // TODO
+ Range range = document.getRange();
+
+ StyleSheet stylesheet = document.getStyleSheet();
+
+ for (int i = 0; i < range.numParagraphs(); i++) {
+ Paragraph paragraph = range.getParagraph(i);
+
+ String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();
+
+ if (styleName.startsWith(HEADER_PREFIX)) {
+ String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim();
+ int levelNum = 0;
+
+ try {
+ levelNum = Integer.parseInt(rawLevelNum);
+ }
+ catch (NumberFormatException nfe) {
+ log.debug("Could not parse heading level from: " + styleName);
+ }
+
+ String text = Paragraph.stripFields(paragraph.text());
+
+ if ('\r' == text.charAt(text.length() - 1)) {
+ text = text.substring(0, text.length() - 1);
+ }
+
+ headings.add(new WordMetadata.WordHeading(text, levelNum));
+ }
}
+
+ metadata.setHeadings(headings);
return metadata;
}
}
Added:
trunk/extensions/dna-sequencer-msoffice/src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java
===================================================================
--- trunk/extensions/dna-sequencer-msoffice/src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java (rev 0)
+++ trunk/extensions/dna-sequencer-msoffice/src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java 2009-02-23 18:53:59 UTC (rev 736)
@@ -0,0 +1,76 @@
+/*
+ * JBoss DNA (http://www.jboss.org/dna)
+ * See the COPYRIGHT.txt file distributed with this work for information
+ * regarding copyright ownership. Some portions may be licensed
+ * to Red Hat, Inc. under one or more contributor license agreements.
+ * See the AUTHORS.txt file in the distribution for a full listing of
+ * individual contributors.
+ *
+ * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
+ * is licensed to you under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * JBoss DNA is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this software; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
+ */
+
+package org.jboss.dna.sequencer.msoffice.word;
+
+import static org.hamcrest.core.Is.is;
+import org.junit.After;
+import static org.junit.Assert.assertThat;
+import org.junit.Test;
+import java.io.InputStream;
+import java.util.List;
+
+public class WordMetadataTest {
+
+ private static final String[] TEST_HEADERS_TEXT = new String[] {
+ "Test Heading 1", "Test Heading 1.1", "Test Heading 1.2", "Test Heading 1.2.1",
+ "Test Heading 2", "Test Heading 2.1", "Test Heading 2.2",
+ };
+ private static final int[] TEST_HEADERS_LEVEL = new int[] {
+ 1, 2, 2, 3, 1, 2, 2
+ };
+
+ private WordMetadata wordMetadata;
+ private InputStream imageStream;
+
+ @After
+ public void afterEach() throws Exception {
+ if (imageStream != null) {
+ try {
+ imageStream.close();
+ } finally {
+ imageStream = null;
+ }
+ }
+ }
+
+ protected InputStream getTestDocument( String resourcePath ) {
+ return this.getClass().getResourceAsStream("/" + resourcePath);
+ }
+
+ @Test
+ public void shouldBeAbleToParseHeadingsForWord() throws Exception {
+
+ wordMetadata = WordMetadataReader.instance(this.getTestDocument("word.doc"));
+ List<WordMetadata.WordHeading> headings = wordMetadata.getHeadings();
+
+ assertThat(headings.size(), is(TEST_HEADERS_TEXT.length));
+
+ for (int i = 0; i < headings.size(); i++) {
+ assertThat(headings.get(i).getText(), is(TEST_HEADERS_TEXT[i]));
+ assertThat(headings.get(i).getHeaderLevel(), is(TEST_HEADERS_LEVEL[i]));
+ }
+
+ }
+}
Property changes on:
trunk/extensions/dna-sequencer-msoffice/src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java
___________________________________________________________________
Name: svn:mime-type
+ text/plain
Modified: trunk/extensions/dna-sequencer-msoffice/src/test/resources/word.doc
===================================================================
(Binary files differ)