DNA SVN: r719 - in trunk/dna-common/src: test/java/org/jboss/dna/common and 1 other directories. - dna-commits

Wednesday, 28 January 2009

Author: rhauch
Date: 2009-01-28 21:32:27 -0500 (Wed, 28 Jan 2009)
New Revision: 719

Added:
   trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.java
   trunk/dna-common/src/test/java/org/jboss/dna/common/xml/
   trunk/dna-common/src/test/java/org/jboss/dna/common/xml/XmlCharactersTest.java
Log:
DNA-278 Create new utility class for determining validity of XML NCNames

Created the implementation class by using a single structure to hold for each character a
bitmask that describes the set of classifications that character belongs to.  For example,
'a' belongs to multiple character classes, so the bitmask ORs together the bitmask
of each of those character classes.  A simple array is used to store all the bitmasks,
indexed by the integer value of the character.  Then, each method (e.g., to check whether
a character is a valid NCName starting character) simply has to look up the mask and
perform a single bit operation.  (Some methods also have some shortcuts, as several of the
classes have a very large range of characters (at the upper end) that are or are not in
the character class, and these can be checked by simply comparing the integer value of the
character with the lower end of the range.)

Added: trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.java
===================================================================

--- trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.java	           
            (rev 0)
+++ trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.java	2009-01-29
02:32:27 UTC (rev 719)
@@ -0,0 +1,292 @@
+/*
+ * JBoss DNA (http://www.jboss.org/dna)
+ * See the COPYRIGHT.txt file distributed with this work for information
+ * regarding copyright ownership.  Some portions may be licensed
+ * to Red Hat, Inc. under one or more contributor license agreements.
+ * See the AUTHORS.txt file in the distribution for a full listing of 
+ * individual contributors.
+ *
+ * Unless otherwise indicated, all code in JBoss DNA is licensed
+ * to you under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ * 
+ * JBoss DNA is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this software; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
+ */
+package org.jboss.dna.common.xml;
+
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+
/**
 * A utility class for determining the validity of various XML names, per the
 * <a href="http://www.w3.org/TR/REC-xml/">XML 1.0 Specification</a>.
 * <p>
 * Every character in the Basic Multilingual Plane has an entry in a lookup table holding a bitmask of the
 * character classes it belongs to, so each check is a single array access and bit test. Code points above
 * the Basic Multilingual Plane are handled with simple range comparisons. All single-character methods take
 * a Unicode code point and return <code>false</code> (rather than failing) for values that are negative or
 * otherwise outside the class being tested.
 * </p>
 */
public final class XmlCharacters {

    /** The number of code points covered by the lookup table (the Basic Multilingual Plane). */
    private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000

    /** The last code point of the <code>[#x10000-#x10FFFF]</code> range of production [2] Char. */
    private static final int LAST_VALID_CODE_POINT = 0x10FFFF;

    /** The last code point of the <code>[#x10000-#xEFFFF]</code> range of production [4] NameStartChar. */
    private static final int LAST_NAME_CODE_POINT = 0xEFFFF;

    /**
     * This implementation uses an array that captures for each character the XML classifications.
     * An array is used because it is a fast way of looking up each character.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    static {
        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        // The last range does not fit in the table and is handled in 'isValid' and 'isValidContent'.
        final int validMask = VALID_CHARACTER | CONTENT_CHARACTER;
        addCharacters("\t\n\r", validMask);
        addRange(0x20, 0xD7FF, validMask);
        addRange(0xE000, 0xFFFD, validMask);

        // Remove the characters that this class does not treat as XML content: '<', '&', '\n', '\r', ']'
        // NOTE(review): production [14] CharData only forbids '<', '&' and the sequence "]]>"; the
        // stricter set used here is kept as-is so that 'isValidContent' does not change behavior.
        removeCharacters("<&\n\r]", CONTENT_CHARACTER);

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        addCharacters(" \t\r\n", SPACE_CHARACTER);

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Every start character is also a valid non-start character, for both Name and NCName.
        // The last range does not fit in the table and is handled by 'isSupplementaryNameCharacter'.
        final int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        addCharacters("_", nameStartMask);
        addRange('A', 'Z', nameStartMask);
        addRange('a', 'z', nameStartMask);
        addRange(0xC0, 0xD6, nameStartMask);
        addRange(0xD8, 0xF6, nameStartMask);
        addRange(0xF8, 0x2FF, nameStartMask);
        addRange(0x370, 0x37D, nameStartMask);
        addRange(0x37F, 0x1FFF, nameStartMask);
        addRange(0x200C, 0x200D, nameStartMask);
        addRange(0x2070, 0x218F, nameStartMask);
        addRange(0x2C00, 0x2FEF, nameStartMask);
        addRange(0x3001, 0xD7FF, nameStartMask);
        addRange(0xF900, 0xFDCF, nameStartMask);
        addRange(0xFDF0, 0xFFFD, nameStartMask);

        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        // So ':' only ever receives the Name masks, never the NCName masks.
        addCharacters(":", NAME_START_CHARACTER | NAME_CHARACTER);

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // These are valid in a Name or NCName, but NOT as the first character.
        final int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        addCharacters("-.", nameMask);
        addRange('0', '9', nameMask);
        addRange(0xB7, 0xB7, nameMask);
        addRange(0x0300, 0x036F, nameMask);
        addRange(0x203F, 0x2040, nameMask);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        addCharacters(" \r\n", PUBID_CHARACTER);
        addRange('A', 'Z', PUBID_CHARACTER);
        addRange('a', 'z', PUBID_CHARACTER);
        addRange('0', '9', PUBID_CHARACTER);
        addCharacters("-'()+,./:=?;!*#@$_%", PUBID_CHARACTER);
    }

    private XmlCharacters() {
    }

    /** Add the supplied classification bits to every table entry in the inclusive range. */
    private static void addRange( int first, int last, int mask ) {
        for (int i = first; i <= last; ++i) {
            MASKS[i] |= mask;
        }
    }

    /** Add the supplied classification bits to the table entry of each character in the string. */
    private static void addCharacters( String characters, int mask ) {
        for (int i = 0; i < characters.length(); ++i) {
            MASKS[characters.charAt(i)] |= mask;
        }
    }

    /** Clear the supplied classification bits from the table entry of each character in the string. */
    private static void removeCharacters( String characters, int mask ) {
        for (int i = 0; i < characters.length(); ++i) {
            MASKS[characters.charAt(i)] &= ~mask;
        }
    }

    /** Determine whether the code point is covered by the table and has at least one of the supplied bits. */
    private static boolean hasMask( int c, int mask ) {
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /** Determine whether the code point is in <code>[#x10000-#xEFFFF]</code>, the name range beyond the table. */
    private static boolean isSupplementaryNameCharacter( int c ) {
        return c >= NUMBER_OF_CHARACTERS && c <= LAST_NAME_CODE_POINT;
    }

    /** Determine whether the code point is in <code>[#x10000-#x10FFFF]</code>, the Char range beyond the table. */
    private static boolean isSupplementaryCharacter( int c ) {
        return c >= NUMBER_OF_CHARACTERS && c <= LAST_VALID_CODE_POINT;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name.
     * The first character in an XML name is more restrictive than the
     * {@link #isValidName(int) remaining characters}.
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasMask(c, NAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName.
     * The first character in an XML NCName is more restrictive than the
     * {@link #isValidNcName(int) remaining characters}.
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasMask(c, NCNAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name.
     * The {@link #isValidNameStart(int) first character} in an XML name is more restrictive than the
     * remaining characters.
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasMask(c, NAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName.
     * The {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the
     * remaining characters.
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasMask(c, NCNAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasMask(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        return hasMask(c, VALID_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. Besides the characters
     * that are not {@link #isValid(int) valid} at all, this rejects '&lt;', '&amp;', ']', line feed and
     * carriage return.
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasMask(c, CONTENT_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML
     * 
     * @param c the character, as a Unicode code point
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        // All whitespace characters are at or below 0x20, so anything larger can be rejected without a lookup.
        return c <= 0x20 && hasMask(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name. The name is examined one Unicode code point at a
     * time, so a surrogate pair is checked as the single character it encodes.
     * 
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the
     *         name is null or empty)
     */
    public static boolean isValidName( String name ) {
        if (name == null || name.length() == 0) {
            return false;
        }
        int first = name.codePointAt(0);
        if (!isValidNameStart(first)) {
            return false;
        }
        int index = Character.charCount(first);
        while (index < name.length()) {
            int c = name.codePointAt(index);
            if (!isValidName(c)) {
                return false;
            }
            index += Character.charCount(c);
        }
        return true;
    }

    /**
     * Determine if the supplied name is a valid XML NCName. The name is examined one Unicode code point at
     * a time, so a surrogate pair is checked as the single character it encodes.
     * 
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when
     *         the name is null or empty)
     */
    public static boolean isValidNcName( String name ) {
        if (name == null || name.length() == 0) {
            return false;
        }
        int first = name.codePointAt(0);
        if (!isValidNcNameStart(first)) {
            return false;
        }
        int index = Character.charCount(first);
        while (index < name.length()) {
            int c = name.codePointAt(index);
            if (!isValidNcName(c)) {
                return false;
            }
            index += Character.charCount(c);
        }
        return true;
    }
}


Property changes on:
trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.java
___________________________________________________________________
Name: svn:mime-type
   + text/plain

Added: trunk/dna-common/src/test/java/org/jboss/dna/common/xml/XmlCharactersTest.java
===================================================================
--- trunk/dna-common/src/test/java/org/jboss/dna/common/xml/XmlCharactersTest.java	       
                (rev 0)
+++
trunk/dna-common/src/test/java/org/jboss/dna/common/xml/XmlCharactersTest.java	2009-01-29
02:32:27 UTC (rev 719)
@@ -0,0 +1,77 @@
+/*
+ * JBoss DNA (http://www.jboss.org/dna)
+ * See the COPYRIGHT.txt file distributed with this work for information
+ * regarding copyright ownership.  Some portions may be licensed
+ * to Red Hat, Inc. under one or more contributor license agreements.
+ * See the AUTHORS.txt file in the distribution for a full listing of 
+ * individual contributors.
+ *
+ * Unless otherwise indicated, all code in JBoss DNA is licensed
+ * to you under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ * 
+ * JBoss DNA is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this software; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
+ */
+package org.jboss.dna.common.xml;
+
+import static org.hamcrest.core.Is.is;
+import static org.junit.Assert.assertThat;
+import org.junit.Test;
+
+/**
+ * 
+ */
+public class XmlCharactersTest {
+
+    @Test
+    public void shouldNotAllowColonInNcName() {
+        assertThat(XmlCharacters.isValidNcNameStart(':'), is(false));
+    }
+
+    @Test
+    public void shouldNotAllowDigitAsFirstCharacterInName() {
+        assertThat(XmlCharacters.isValidNameStart('0'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('1'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('2'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('3'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('4'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('5'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('6'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('7'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('8'), is(false));
+        assertThat(XmlCharacters.isValidNameStart('9'), is(false));
+    }
+
+    @Test
+    public void shouldAllowLettersAsFirstCharacterInName() {
+        for (char c = 'a'; c <= 'z'; ++c) {
+            assertThat(XmlCharacters.isValidNameStart(c), is(true));
+        }
+        for (char c = 'A'; c <= 'Z'; ++c) {
+            assertThat(XmlCharacters.isValidNameStart(c), is(true));
+        }
+    }
+
+    @Test
+    public void shouldNotAllowDigitAsFirstCharacterInNcName() {
+        assertThat(XmlCharacters.isValidNcNameStart('0'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('1'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('2'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('3'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('4'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('5'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('6'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('7'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('8'), is(false));
+        assertThat(XmlCharacters.isValidNcNameStart('9'), is(false));
+    }
+}


Property changes on:
trunk/dna-common/src/test/java/org/jboss/dna/common/xml/XmlCharactersTest.java
___________________________________________________________________
Name: svn:mime-type
   + text/plain


    

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

DNA SVN: r719 - in trunk/dna-common/src: test/java/org/jboss/dna/common and 1 other directories.