程序人生 2010-02-25 15:55:04 阅读335 评论0 字号:大中小 订阅
// Decodes one chunk of raw resource bytes (`data`, `len` bytes long) into a String.
//
// Before decoding, the encoding is refined once per kind of check: BOM
// detection, then the CSS charset rule (CSS content only) or the head charset
// declaration (HTML/XML content only), then Japanese auto-detection.
// Returns an empty string when one of the charset checks returns false;
// otherwise returns whatever m_decoder produces for this chunk plus any bytes
// previously held in m_buffer.
String TextResourceDecoder::decode(const char* data, size_t len)
{
    // Check whether the data is Unicode-encoded (BOM sniffing).
    if (!m_checkedForBOM)
        checkForBOM(data, len);

    // Set by the charset checks below when they have already appended `data` to m_buffer.
    bool movedDataToBuffer = false;

    // For CSS, look for the stylesheet's own charset declaration.
    const bool needsCSSCharsetCheck = m_contentType == CSS && !m_checkedForCSSCharset;
    if (needsCSSCharsetCheck && !checkForCSSCharset(data, len, movedDataToBuffer))
        return ""; // NOTE(review): presumably "need more data before deciding" - confirm.

    // For HTML and XML, look for a charset declared in the document head.
    const bool isMarkup = m_contentType == HTML || m_contentType == XML;
    if (isMarkup && !m_checkedForHeadCharset && !checkForHeadCharset(data, len, movedDataToBuffer))
        return "";

    // Do the auto-detect if our default encoding is one of the Japanese ones.
    // FIXME: It seems wrong to change our encoding downstream after we have already done some decoding.
    // (Blog author's aside: why is there no equivalent detection for Chinese encodings?)
    const bool encodingAlreadyDecided = m_source == UserChosenEncoding || m_source == AutoDetectedEncoding;
    if (!encodingAlreadyDecided && encoding().isJapanese())
        detectJapaneseEncoding(data, len);

    ASSERT(encoding().isValid());

    const bool contentIsXML = m_contentType == XML;

    // Nothing buffered: decode the caller's bytes directly.
    if (m_buffer.isEmpty())
        return m_decoder.decode(data, len, false, contentIsXML, m_sawError);

    // Bytes are pending in m_buffer; append this chunk unless a charset check
    // above already did so, then decode the whole buffer in one go.
    if (!movedDataToBuffer) {
        const size_t previousSize = m_buffer.size();
        m_buffer.grow(previousSize + len);
        memcpy(m_buffer.data() + previousSize, data, len);
    }

    String decoded = m_decoder.decode(m_buffer.data(), m_buffer.size(), false, contentIsXML, m_sawError);
    m_buffer.clear();
    return decoded;
}
再回到 tokenizer->write(decoded, true); 来看它的具体实现:
bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)
{
if (!m_buffer)
return false;
if (m_parserStopped)
return false;
SegmentedString source(str);
if (m_executingScript)
source.setExcludeLineNumbers();
if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
// don't parse; we will do this later
if (m_currentPrependingSrc)
m_currentPrependingSrc->append(source);
else {
m_pendingSrc.append(source);
#if PRELOAD_SCANNER_ENABLED
if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
m_preloadScanner->write(source);
#endif
}
return false;
}
#if PRELOAD_SCANNER_ENABLED
if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
m_preloadScanner->end();
#endif
if (!m_src.isEmpty())
m_src.append(source);
else
setSrc(source);
// Once a timer is set, it has control of when the tokenizer continues.
if (m_timer.isActive())
return false;
bool wasInWrite = m_inWrite;
m_inWrite = true;
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!m_doc->ownerElement())
printf("Beginning write at time %d ", m_doc->elapsedTime());
#endif
int processedCount = 0;
double startTime = currentTime();
Frame* frame = m_doc->frame();
State state = m_state;
while (!m_src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
if (!continueProcessing(processedCount, startTime, state))
break;
// do we need to enlarge the buffer?
checkBuffer();
UChar cc = *m_src;
bool wasSkipLF = state.skipLF();
if (wasSkipLF)
state.setSkipLF(false);
if (wasSkipLF && (cc == ' '))
m_src.advance();
else if (state.needsSpecialWriteHandling()) {
// it's important to keep needsSpecialWriteHandling with the flags this block tests
if (state.hasEntityState())
state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
else if (state.inPlainText())
state = parseText(m_src, state);
else if (state.inAnySpecial())
state = parseSpecial(m_src, state);
else if (state.inComment())
state = parseComment(m_src, state);
else if (state.inDoctype())
state = parseDoctype(m_src, state);
else if (state.inServer())
state = parseServer(m_src, state);
else if (state.inProcessingInstruction())
state = parseProcessingInstruction(m_src, state);
else if (state.hasTagState())
state = parseTag(m_src, state);
else if (state.startTag()) {
state.setStartTag(false);
switch(cc) {
case '/':
break;
case '!': {
// or
searchCount = 1; // Look for ' m_doctypeSearchCount = 1;
break;
}
case '?': {
// xml processing instruction
state.setInProcessingInstruction(true);
tquote = NoQuote;
state = parseProcessingInstruction(m_src, state);
continue;
break;
}
case '%':
if (!m_brokenServer) {
// <% server stuff, handle as comment %>
state.setInServer(true);
tquote = NoQuote;
state = parseServer(m_src, state);
continue;
}
// else fall through
default: {
if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
// Start of a Start-Tag
} else {
// Invalid tag
// Add as is
*m_dest = '<';
m_dest++;
continue;
}
}
}; // end case
processToken();
m_cBufferPos = 0;
state.setTagState(TagName);
state = parseTag(m_src, state);
}
} else if (cc == '&' && !m_src.escaped()) {
m_src.advancePastNonNewline();
state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
} else if (cc == '<' && !m_src.escaped()) {
m_currentTagStartLineNumber = m_lineNumber;
m_src.advancePastNonNewline();
state.setStartTag(true);
state.setDiscardLF(false);
} else if (cc == ' ' || cc == ' ') {
if (state.discardLF())
// Ignore this LF
state.setDiscardLF(false); // We have discarded 1 LF
else {
// Process this LF
*m_dest++ = ' ';
if (cc == ' ' && !m_src.excludeLineNumbers())
m_lineNumber++;
}
/* Check for MS-DOS CRLF sequence */
if (cc == ' ')
state.setSkipLF(true);
m_src.advance(m_lineNumber);
} else {
state.setDiscardLF(false);
*m_dest++ = cc;
m_src.advancePastNonNewline();
}
}
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!m_doc->ownerElement())
printf("Ending write at time %d ", m_doc->elapsedTime());
#endif
m_inWrite = wasInWrite;
m_state = state;
if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
end(); // this actually causes us to be deleted
return true;
}
return false;
}
调用时,由于实参 decoded 是 String 类型,会先被隐式转换为 SegmentedString。SegmentedString 可以附带行号,也可以不带行号(可通过设置控制)。上面程序中的 while 循环,就是整个分析过程的主体。