Sartika Aritonang / news · Commits · 6f7afb49
Commit 6f7afb49, authored May 29, 2020 by Sartika Aritonang (parent ba77cac7)
Showing 1 changed file with 1721 additions and 0 deletions:
stbi/Lib/site-packages/pip/_vendor/html5lib/_tokenizer.py (new file, 0 → 100644, +1721 −0)
from __future__ import absolute_import, division, unicode_literals

from pip._vendor.six import unichr as chr

from collections import deque

from .constants import spaceCharacters
from .constants import entities
from .constants import asciiLetters, asciiUpper2Lower
from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters

from ._inputstream import HTMLInputStream
from ._trie import Trie

entitiesTrie = Trie(entities)

class HTMLTokenizer(object):
    """ This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.stream
      Points to HTMLInputStream object.
    """

    def __init__(self, stream, parser=None, **kwargs):
        self.stream = HTMLInputStream(stream, **kwargs)
        self.parser = parser

        # Setup the initial tokenizer state
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.dataState
        self.escape = False

        # The current token being created
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()

    def __iter__(self):
        """ This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token, which pauses processing until the next
        token is requested.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
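
    # Example sketch: iterating the tokenizer drives the state machine one
    # state call at a time; stream-level parse errors are yielded before the
    # queued tokens. For a hypothetical input:
    #
    #     for token in HTMLTokenizer("<p>Hi &amp; bye</p>"):
    #         print(token["type"], token.get("data"))
    #
    # each yielded token is a dict whose "type" is a value from tokenTypes.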

    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif (0xD800 <= charAsInt <= 0xDFFF) or (charAsInt > 0x10FFFF):
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Should speed up this check somehow (e.g. move the set to a constant)
            if ((0x0001 <= charAsInt <= 0x0008) or
                    (0x000E <= charAsInt <= 0x001F) or
                    (0x007F <= charAsInt <= 0x009F) or
                    (0xFDD0 <= charAsInt <= 0xFDEF) or
                    charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                            0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                            0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                            0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                            0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                            0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                            0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                            0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                            0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
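
    # Worked example (illustrative): for the hypothetical input "&#x48;", the
    # "#" and "x" are consumed by consumeEntity, then consumeNumberEntity(True)
    # reads "48", computes charAsInt = 0x48 and returns "H". For "&#0;" the
    # code point is listed in replacementCharacters, so U+FFFD is returned and
    # an "illegal-codepoint-for-numeric-entity" ParseError token is queued.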

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
                (allowedChar is not None and allowedChar == charStack[0])):
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # At this point in the process we might have a named entity.
            # Entities are stored in the global variable "entities".
            #
            # Consume characters and compare these to a substring of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "named-entity-without-semicolon"})
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})

    def processEntityInAttribute(self, allowedChar):
        """This method replaces the need for "entityInAttributeValueState".
        """
        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)

    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        """
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
            token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["EndTag"]:
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "attributes-in-end-tag"})
                if token["selfClosing"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "self-closing-flag-on-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.dataState
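
    # Example (illustrative): a start tag tokenized from the hypothetical
    # input "<DIV Class='x'>" reaches emitCurrentToken with name "DIV"; the
    # name is lower-cased via asciiUpper2Lower to "div" before the token is
    # queued, and the state is reset to dataState. End tags that carry
    # attributes or a self-closing flag additionally queue ParseError tokens.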

    # Below are the various tokenizer states worked out.
    def dataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.entityDataState
        elif data == "<":
            self.state = self.tagOpenState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\u0000"})
        elif data is EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
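
    # Example (illustrative): for the hypothetical input "a&amp;<b>", dataState
    # first queues a Characters token "a", then "&" switches to entityDataState
    # (which resolves the entity and queues "&" as Characters), and "<" switches
    # to tagOpenState, which starts building the StartTag token for "b".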

    def entityDataState(self):
        self.consumeEntity()
        self.state = self.dataState
        return True

    def rcdataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.characterReferenceInRcdata
        elif data == "<":
            self.state = self.rcdataLessThanSignState
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def characterReferenceInRcdata(self):
        self.consumeEntity()
        self.state = self.rcdataState
        return True

    def rawtextState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.rawtextLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.scriptDataLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def plaintextState(self):
        data = self.stream.char()
        if data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + self.stream.charsUntil("\u0000")})
        return True

    def tagOpenState(self):
        data = self.stream.char()
        if data == "!":
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True

    def closeTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-closing-tag-but-got-right-bracket"})
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-closing-tag-but-got-eof"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.state = self.dataState
        else:
            # XXX data can be _'_...
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        return True

    def tagNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-tag-name"})
            self.state = self.dataState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            self.currentToken["name"] += data
            # (Don't use charsUntil here, because tag names are
            # very short and it's faster to not do anything fancy)
        return True

    def rcdataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rcdataEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rcdataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rawtextLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rawtextEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rawtextEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def scriptDataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEndTagOpenState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
            self.state = self.scriptDataEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.scriptDataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapeStartState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapeStartDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapeStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
        elif data == EOF:
            self.state = self.dataState
        else:
            chars = self.stream.charsUntil(("<", "-", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEscapedEndTagOpenState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
            self.temporaryBuffer = data
            self.state = self.scriptDataDoubleEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer = data
            self.state = self.scriptDataEscapedEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataDoubleEscapeStartState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataDoubleEscapedState
            else:
                self.state = self.scriptDataEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataDoubleEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True

    def scriptDataDoubleEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
            self.temporaryBuffer = ""
            self.state = self.scriptDataDoubleEscapeEndState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapeEndState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataEscapedState
            else:
                self.state = self.scriptDataDoubleEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True

    def beforeAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True

    def attributeNameState(self):
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data + \
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True

    def afterAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True

    def beforeAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True

    def attributeValueDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute('"')
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-value-double-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data + \
                self.stream.charsUntil(("\"", "&", "\u0000"))
        return True

    def attributeValueSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute("'")
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-value-single-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data + \
                self.stream.charsUntil(("'", "&", "\u0000"))
        return True

    def attributeValueUnQuotedState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True

    def afterAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-EOF-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-character-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True

    def selfClosingStartTagState(self):
        data = self.stream.char()
        if data == ">":
            self.currentToken["selfClosing"] = True
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-EOF-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-character-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True

    def bogusCommentState(self):
        # Make a new comment token and give it as value all the characters
        # until the first > or EOF (charsUntil checks for EOF automatically)
        # and emit it.
        data = self.stream.charsUntil(">")
        data = data.replace("\u0000", "\uFFFD")
        self.tokenQueue.append(
            {"type": tokenTypes["Comment"], "data": data})

        # Eat the character directly after the bogus comment which is either a
        # ">" or an EOF.
        self.stream.char()
        self.state = self.dataState
        return True

    def markupDeclarationOpenState(self):
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "expected-dashes-or-doctype"})

        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True

    def commentStartState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True

    def commentStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True

    def commentState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True

    def commentEndDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True

    def commentEndState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True

    def commentEndBangState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True

    def doctypeState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "need-space-after-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeNameState
        return True

    def beforeDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True

    def doctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True

    def afterDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-space-or-right-bracket-in-doctype",
                                    "datavars": {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True

    def afterDoctypePublicKeywordState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypePublicIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        return True

    def beforeDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def doctypePublicIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True

    def doctypePublicIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True

    def afterDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def betweenDoctypePublicAndSystemIdentifiersState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def afterDoctypeSystemKeywordState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypeSystemIdentifierState
        return True

    def beforeDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True

    def doctypeSystemIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True

    def doctypeSystemIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True

    def afterDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "unexpected-char-in-doctype"})
            self.state = self.bogusDoctypeState
        return True

    def bogusDoctypeState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            # XXX EMIT
            self.stream.unget(data)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            pass
        return True

    def cdataSectionState(self):
        data = []
        while True:
            data.append(self.stream.charsUntil("]"))
            data.append(self.stream.charsUntil(">"))
            char = self.stream.char()
            if char == EOF:
                break
            else:
                assert char == ">"
                if data[-1][-2:] == "]]":
                    data[-1] = data[-1][:-2]
                    break
                else:
                    data.append(char)

        data = "".join(data)  # pylint:disable=redefined-variable-type
        # Deal with null here rather than in the parser
        nullCount = data.count("\u0000")
        if nullCount > 0:
            for _ in range(nullCount):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "invalid-codepoint"})
            data = data.replace("\u0000", "\uFFFD")
        if data:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.dataState
        return True
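
The tokenizer can be exercised on its own, without the tree-building parser. A minimal sketch, assuming the vendored package layout shown in the file path above (pip's bundled copy of html5lib); the input string is hypothetical:

    from pip._vendor.html5lib._tokenizer import HTMLTokenizer

    # Tokenize a small fragment and print each token dict as it is produced.
    for token in HTMLTokenizer("<p class='note'>Hi &amp; bye</p>"):
        print(token)

Each token is a plain dict: tag tokens carry "name", "data" (the attribute list) and "selfClosing", while Characters, SpaceCharacters, Comment and ParseError tokens carry their text in "data".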