From dna-commits at lists.jboss.org Wed Jan 28 21:32:28 2009 Content-Type: multipart/mixed; boundary="===============6779258157755714304==" MIME-Version: 1.0 From: dna-commits at lists.jboss.org To: dna-commits at lists.jboss.org Subject: [dna-commits] DNA SVN: r719 - in trunk/dna-common/src: test/java/org/jboss/dna/common and 1 other directories. Date: Wed, 28 Jan 2009 21:32:28 -0500 Message-ID: --===============6779258157755714304== Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Author: rhauch Date: 2009-01-28 21:32:27 -0500 (Wed, 28 Jan 2009) New Revision: 719 Added: trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.java trunk/dna-common/src/test/java/org/jboss/dna/common/xml/ trunk/dna-common/src/test/java/org/jboss/dna/common/xml/XmlCharactersTest.java Log: DNA-278 Create new utility class for determining validity of XML NCNames Created the implementation class by using a single structure to hold for each character a bitmask that describes the set of classifications that character belongs to. For example, 'a' belongs to multiple character classes, so the bitmask ORs together the bitmask of each of those character classes. A simple array is used to store all the bitmasks, indexed by the integer value of the character. Then, each method (e.g., to check whether a character is a valid NCName starting character) simply has to look up the mask and perform a single bit operation. (Some methods also have some shortcuts, as several of the classes have a very large range of characters (at the upper end) that are or are not in the character class, and these can be checked by simply comparing the integer value of the character with the lower end of the range.)
Added: trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacter= s.java =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.j= ava (rev 0) +++ trunk/dna-common/src/main/java/org/jboss/dna/common/xml/XmlCharacters.j= ava 2009-01-29 02:32:27 UTC (rev 719) @@ -0,0 +1,292 @@ +/* + * JBoss DNA (http://www.jboss.org/dna) + * See the COPYRIGHT.txt file distributed with this work for information + * regarding copyright ownership. Some portions may be licensed + * to Red Hat, Inc. under one or more contributor license agreements. + * See the AUTHORS.txt file in the distribution for a full listing of = + * individual contributors. + * + * Unless otherwise indicated, all code in JBoss DNA is licensed + * to you under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * = + * JBoss DNA is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free + * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA, or see the FSF site: http://www.fsf.org. + */ +package org.jboss.dna.common.xml; + +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; + +/** + * A utility class for determining the validity of various XML names, per = the = + * XML 1.0 Specification. 
/**
 * A utility class for determining the validity of individual characters and of XML Names and NCNames, per the XML 1.0
 * Specification.
 * <p>
 * Each of the 65536 {@code char} values has an entry in a lookup table whose bits record the XML character classes that
 * the character belongs to, so a check is an array access followed by a bit test. Code points above {@code 0xFFFF} do not
 * fit in the table and are classified by comparing them against the ranges given in the specification. Values that are
 * not code points at all (negative, or above {@code 0x10FFFF}) are never valid.
 * </p>
 */
public class XmlCharacters {

    private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000

    /** Bounds of the valid-character range that lies beyond the lookup table: [#x10000-#x10FFFF]. */
    private static final int FIRST_SUPPLEMENTARY_CHARACTER = NUMBER_OF_CHARACTERS;
    private static final int LAST_SUPPLEMENTARY_CHARACTER = 0x10FFFF;

    /** Upper bound of the name-character range that lies beyond the lookup table: [#x10000-#xEFFFF]. */
    private static final int LAST_SUPPLEMENTARY_NAME_CHARACTER = 0xEFFFF;

    /**
     * This implementation uses an array that captures for each character the XML classifications. An array is used because
     * it is a fast way of looking up each character.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    static {
        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        int validMask = VALID_CHARACTER | CONTENT_CHARACTER;
        mark(0x9, validMask);
        mark(0xA, validMask);
        mark(0xD, validMask);
        mark(0x20, 0xD7FF, validMask);
        mark(0xE000, 0xFFFD, validMask);
        // The last range is bigger than the table, so it is handled by range checks in the query methods.

        // Remove the characters that this class does not treat as XML content: '<', '&', '\n', '\r', ']'
        unmark('<', CONTENT_CHARACTER);
        unmark('&', CONTENT_CHARACTER);
        unmark('\n', CONTENT_CHARACTER);
        unmark('\r', CONTENT_CHARACTER);
        unmark(']', CONTENT_CHARACTER);

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        mark(0x20, SPACE_CHARACTER);
        mark(0x9, SPACE_CHARACTER);
        mark(0xA, SPACE_CHARACTER);
        mark(0xD, SPACE_CHARACTER);

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Every start character is also a valid non-first character, for both Names and NCNames.
        int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        mark(':', nameStartMask);
        mark('_', nameStartMask);
        mark('A', 'Z', nameStartMask);
        mark('a', 'z', nameStartMask);
        mark(0xC0, 0xD6, nameStartMask);
        mark(0xD8, 0xF6, nameStartMask);
        mark(0xF8, 0x2FF, nameStartMask);
        mark(0x370, 0x37D, nameStartMask);
        mark(0x37F, 0x1FFF, nameStartMask);
        mark(0x200C, 0x200D, nameStartMask);
        mark(0x2070, 0x218F, nameStartMask);
        mark(0x2C00, 0x2FEF, nameStartMask);
        mark(0x3001, 0xD7FF, nameStartMask);
        mark(0xF900, 0xFDCF, nameStartMask);
        mark(0xFDF0, 0xFFFD, nameStartMask);
        // The last range is bigger than the table, so it is handled by range checks in the query methods.

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // These may continue a Name or NCName but may NOT start one, so they get only the non-start bits.
        int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        mark('-', nameMask);
        mark('.', nameMask);
        mark(0xB7, nameMask);
        mark('0', '9', nameMask);
        mark(0x0300, 0x036F, nameMask);
        mark(0x203F, 0x2040, nameMask);

        // --------
        // NC Names
        // --------
        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        unmark(':', NCNAME_START_CHARACTER | NCNAME_CHARACTER);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        mark(0x20, PUBID_CHARACTER);
        mark(0xA, PUBID_CHARACTER);
        mark(0xD, PUBID_CHARACTER);
        mark('A', 'Z', PUBID_CHARACTER);
        mark('a', 'z', PUBID_CHARACTER);
        mark('0', '9', PUBID_CHARACTER);
        String pubidPunctuation = "-'()+,./:=?;!*#@$_%";
        for (int i = 0; i < pubidPunctuation.length(); ++i) {
            mark(pubidPunctuation.charAt(i), PUBID_CHARACTER);
        }
    }

    private XmlCharacters() {
    }

    /** Add the given classification bits to a single table entry. */
    private static void mark( int c,
                              int mask ) {
        MASKS[c] |= mask;
    }

    /** Add the given classification bits to every table entry in the inclusive range. */
    private static void mark( int first,
                              int last,
                              int mask ) {
        for (int c = first; c <= last; ++c) {
            MASKS[c] |= mask;
        }
    }

    /** Remove the given classification bits from a single table entry. */
    private static void unmark( int c,
                                int mask ) {
        MASKS[c] &= ~mask;
    }

    /** Return true if the value falls inside the table and its entry has any of the given bits set. */
    private static boolean hasClass( int c,
                                     int mask ) {
        // The lower bound matters: a negative argument must yield false rather than index out of bounds.
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /** Return true if the value is in [#x10000-#x10FFFF], the part of production [2] that is not in the table. */
    private static boolean isSupplementaryCharacter( int c ) {
        return FIRST_SUPPLEMENTARY_CHARACTER <= c && c <= LAST_SUPPLEMENTARY_CHARACTER;
    }

    /** Return true if the value is in [#x10000-#xEFFFF], the part of production [4] that is not in the table. */
    private static boolean isSupplementaryNameCharacter( int c ) {
        return FIRST_SUPPLEMENTARY_CHARACTER <= c && c <= LAST_SUPPLEMENTARY_NAME_CHARACTER;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name. The first character in an XML name
     * is more restrictive than the {@link #isValidName(int) remaining characters}.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasClass(c, NAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName. The first character in an XML
     * NCName is more restrictive than the {@link #isValidNcName(int) remaining characters}.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasClass(c, NCNAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name. The
     * {@link #isValidNameStart(int) first character} in an XML name is more restrictive than the remaining characters.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasClass(c, NAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName. The
     * {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasClass(c, NCNAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasClass(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        return hasClass(c, VALID_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. In addition to invalid XML characters,
     * this rejects {@code '<'}, {@code '&'}, {@code '\n'}, {@code '\r'} and {@code ']'}.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasClass(c, CONTENT_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML, that is one of {@code #x20},
     * {@code #x9}, {@code #xD} or {@code #xA}.
     * 
     * @param c the character (code point); values that are not code points are never valid
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        return hasClass(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name. The string is examined one code point at a time, so surrogate
     * pairs are classified as the supplementary character they encode, and an unpaired surrogate makes the name invalid.
     * 
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the name is null or
     *         empty)
     */
    public static boolean isValidName( String name ) {
        if (name == null || name.length() == 0) {
            return false;
        }
        int first = name.codePointAt(0);
        if (!isValidNameStart(first)) {
            return false;
        }
        // Index-based iteration: no character value (not even U+FFFF) can be mistaken for an end-of-input marker.
        int index = Character.charCount(first);
        while (index < name.length()) {
            int c = name.codePointAt(index);
            if (!isValidName(c)) {
                return false;
            }
            index += Character.charCount(c);
        }
        return true;
    }

    /**
     * Determine if the supplied name is a valid XML NCName. The string is examined one code point at a time, so surrogate
     * pairs are classified as the supplementary character they encode, and an unpaired surrogate makes the name invalid.
     * 
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when the name is null
     *         or empty)
     */
    public static boolean isValidNcName( String name ) {
        if (name == null || name.length() == 0) {
            return false;
        }
        int first = name.codePointAt(0);
        if (!isValidNcNameStart(first)) {
            return false;
        }
        // Index-based iteration: no character value (not even U+FFFF) can be mistaken for an end-of-input marker.
        int index = Character.charCount(first);
        while (index < name.length()) {
            int c = name.codePointAt(index);
            if (!isValidNcName(c)) {
                return false;
            }
            index += Character.charCount(c);
        }
        return true;
    }
}
+ * = + * JBoss DNA is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free + * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA, or see the FSF site: http://www.fsf.org. + */ +package org.jboss.dna.common.xml; + +import static org.hamcrest.core.Is.is; +import static org.junit.Assert.assertThat; +import org.junit.Test; + +/** + * Tests of the {@link XmlCharacters} first-character checks: digits and (for NCNames) ':' are rejected, ASCII letters are accepted as Name start characters. + */ +public class XmlCharactersTest { + + @Test + public void shouldNotAllowColonInNcName() { + assertThat(XmlCharacters.isValidNcNameStart(':'), is(false)); + } + + @Test + public void shouldNotAllowDigitAsFirstCharacterInName() { + assertThat(XmlCharacters.isValidNameStart('0'), is(false)); + assertThat(XmlCharacters.isValidNameStart('1'), is(false)); + assertThat(XmlCharacters.isValidNameStart('2'), is(false)); + assertThat(XmlCharacters.isValidNameStart('3'), is(false)); + assertThat(XmlCharacters.isValidNameStart('4'), is(false)); + assertThat(XmlCharacters.isValidNameStart('5'), is(false)); + assertThat(XmlCharacters.isValidNameStart('6'), is(false)); + assertThat(XmlCharacters.isValidNameStart('7'), is(false)); + assertThat(XmlCharacters.isValidNameStart('8'), is(false)); + assertThat(XmlCharacters.isValidNameStart('9'), is(false)); + } + + @Test + public void shouldAllowLettersAsFirstCharacterInName() { + for (char c =3D 'a'; c <=3D 'z'; ++c) { + assertThat(XmlCharacters.isValidNameStart(c), is(true)); + } + for (char c =3D 'A'; c <=3D 'Z'; ++c) { + assertThat(XmlCharacters.isValidNameStart(c), is(true)); + } + } + + @Test + public void shouldNotAllowDigitAsFirstCharacterInNcName() { + assertThat(XmlCharacters.isValidNcNameStart('0'), is(false)); +
assertThat(XmlCharacters.isValidNcNameStart('1'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('2'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('3'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('4'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('5'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('6'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('7'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('8'), is(false)); + assertThat(XmlCharacters.isValidNcNameStart('9'), is(false)); + } +} Property changes on: trunk/dna-common/src/test/java/org/jboss/dna/common/xm= l/XmlCharactersTest.java ___________________________________________________________________ Name: svn:mime-type + text/plain --===============6779258157755714304==--