コードポイントを読み取っているときのオフセットの問題

再開：私は現在、ソースコードをトークンに変換するActionScript 3レクサーを作成しています。入力を解釈することを選択しました。オプションの代理ペアをクラスUStringにラップした文字列をコードポイントで表しました。フードの下では、UStringPosクラスを使用して最後の読み取り位置をキャッシュします。コードポイントを読み取っているときのオフセットの問題

それがなぜ私は...それはそれは"huehuehue", 9を記録したが、また別の話だったはず

'use strict'; 

import {Lexer}  from 'core/Lexer'; 
import {UString} from 'utils/UString'; 
import ErrorHandler from 'core/ErrorHandler'; 

const errorHandler = new ErrorHandler(true); 

// Tell the length to the `Lexer` manually. 
const lexer = new Lexer(
    new UString('huehuehue'), 9, errorHandler); 

// Scan first token 
lexer.next(); 

const id = lexer.lookahead.value; 

console.log(
    id, 
    id.length 
);

...と識別子"huehuehue"をスキャンする方法

をテストしてみました最後の'e'が欠落していますか？これをスキャンする際の最も内側の方法はLexer#getCommonIdentifierです。私は既に私のUStringの部分をテストしました。そして、それは大丈夫です。

レクサー関連定義

/* 
* Class that turns AS3 code into tokens. 
*/ 
export class Lexer 
{ 
    /* 
    * @param {UString} source 
    * @param {Number} length 
    * @param {ErrorHandler} errorHandler 
    */ 
    constructor(source, length, errorHandler) 
    { 
    this.source = source; 
    this.length = length; 
    this.index = 0; 
    this.lineStart = 0; 
    this.lineNumber = 1; 
    this.comments = []; 

    this.errorHandler = errorHandler; 

    this.previousToken = null; 
    this.token   = null; 
    this.lookahead  = null; 

    this._special = []; 
    } 

    /* 
    * Verifies the end of file. 
    */ 
    eof() 
    { 
    return this.index >= this.length; 
    } 

    /* 
    * Advance the previous, current and lookahead tokens. 
    * The lexer however does not depend on these tokens. 
    */ 
    next() 
    { 
    this.previousToken = this.token; 
    this.token   = this.lookahead; 
    this.lookahead  = this.lex(); 
    } 

    /* 
    * Consumes the next token and return it. 
    */ 
    lex() 
    { 
    this.consumeWhiteSpaces(); 

    while (this.consumeComment()) 
     this.consumeWhiteSpaces(); 

    let cp = this.source.codePointAt(this.index); 

    let pureIdentifier = 
     Character.isIdentifierStart(cp); 

    if (pureIdentifier || (cp === 0x5C)) 
     return this.scanIdentifierOrKeyword(!pureIdentifier); 

    if (this.eof()) 
    { 
     let loc = [ this.index, this.lineNumber ]; 
     return new Token(TokenType.EOF, loc, loc, '<end>'); 
    } 
    } 

    /* 
    * Scan an identifier, keyword or boolean literal. 
    */ 
    scanIdentifierOrKeyword(usingEscape) 
    { 
    const start = this.index; 
    let id; 

    /* Like Esprima does: only identifiers containing 
    * escapes need some overheads. */ 
    if (usingEscape) 
    { 
     id = this.getEscapedIdentifier(
     String.fromCodePoint(this.scanUnicodeEscapeSequence())); 
    } 
    else 
     id = this.getCommonIdentifier(); 

    return new Token(
     TokenType.IDENTIFIER, 
     [ start  , this.lineNumber ], 
     [ this.index, this.lineNumber ], 
     id 
    ); 
    } 

    /* 
    * Interprets an identifier. If any escape appears, switches to 
    * getEscapedIdentifier(). 
    */ 
    getCommonIdentifier() 
    { 
    const start = this.source.position.offset; 
    let cp = 0; 

    // Jump the starting symbol. 
    ++this.index; 

    while (!this.eof()) 
    { 
     cp = this.source.codePointAt(this.index); 

     if (Character.isIdentifierPart(cp)) 
     ++this.index; 

     // Switches to escape-minded task... 
     else if (cp === 0x5C) 
     return this.getUnicodeEscapedIdentifier(
      this.source.string.slice(
      start, this.source.position.offset 
     ) 
     ); 

     else break; 
    } 
    return this.source.string.slice(
     start, this.source.position.offset 
    ); 
    } 

    /* ... */ 
}

utilsの/ UString.js

'use strict'; 

/* 
* String wrapper with methods _based_ on code points. 
*/ 
export class UString 
{ 
    /* 
    * Constructs the {UString}. 
    * 
    * @param {String} s String to be wrapped. 
    */ 
    constructor(s) 
    { 
    /* 
    * @type {String} 
    */ 
    this.string = s; 

    /* 
    * Tracks the last accessed position. 
    * 
    * @type {UStringPos} 
    */ 
    this.position = new UStringPos(0, 0); 
    } 

    /* 
    * Reads a code point at specific index. 
    * 
    * @param {Number} index 
    * @return {Number} 
    */ 
    codePointAt(index) 
    { 
    this.position.walk(this.string, index); 
    return this.string.codePointAt(this.position.offset); 
    } 

    /* 
    * Slices the internal string by code point indices. 
    * 
    * @param {Number} i 
    * @param {Number} j 
    * @return {String} 
    */ 
    slice(i, j) 
    { 
    this.position.walk(this.string, i); 
    i = this.position.offset; 

    this.position.walk(this.string, j); 
    j = this.position.offset; 

    return this.string.slice(i, j); 
    } 
}; 

/* 
* Class that tracks the position of a code point on a string. 
*/ 
export class UStringPos 
{ 
    /* 
    * Constructs the {UStringPos}. 
    * 
    * @param {Number} index The initial index. 
    * @param {Number} offset The initial offset. 
    */ 
    constructor(index, offset) 
    { 
    /* 
    * @type {Number} 
    */ 
    this.index = index; 

    /* 
    * @type {Number} 
    */ 
    this.offset = offset; 
    } 

    /* 
    * Walks to the given index. 
    * 
    * @param {String} s 
    * @param {Number} index 
    * @note No backward. Track the previous position instead. 
    * @return {void} 
    */ 
    walk(s, index) 
    { 
    for (; this.index < index; ++this.index) 
     this.offset += (
     this._usingSurrogates(
      s.charCodeAt(this.offset) 
     ) ? 2 : 1 
    ); 
    } 

    /* 
    * @private 
    */ 
    _usingSurrogates(ch) 
    { 
    return (ch >= 0xD800) && (ch <= 0xDBFF); 
    } 
};

何か？

出典

2017-08-16 Hydro

大丈夫です。だからthis.source.position.offsetの問題でした：++this.indexを実行したとき、私のUStringPosのオフセットは更新されません。問題はスライスのものでした。私は識別子がスタート地点前のオフセットを追跡しなければならなかったので、

this.source.string.slice(
     start, this.source.position.offset 
    );

このスライスには、オフセットに基づいていました。私は自分UStringクラスのスライスを使用し、オフセットと通常のインデックスとして最後の一人として最初のパラメータを使用することができます

ソリューション

。

'use strict'; 

export class UString 
{ 
    // ... 

    /* 
    * Slices the internal string by using a pair of 
    * offset and code point indices. 
    * 
    * @param {Number} i Offset 
    * @param {Number} j 
    * @return {String} 
    */ 
    slice(i, j) 
    { 
    this.position.walk(this.string, j); 
    j = this.position.offset; 

    return this.string.slice(i, j); 
    } 

};

出典

2017-08-16 20:41:50 Hydro

コードポイントを読み取っているときのオフセットの問題

答えて

関連する問題