
I have a language that is line based. I would like to recognize certain words as "keywords" only if they are on certain lines...
For instance, I have a line that starts "#setup" that I want to recognize the following:
#setup letter landscape regular 8 margin commas scale 1000 unlimited
I want to highlight #setup, letter, ..., except 8 and 1000 as keywords.
However, letter is only a keyword when it is on a line starting with #setup.
It seems to me that using lexical states is the correct way to handle this, so I tried to use the following code (sorry, it's long):The "lexicalState" that is passed to this function the next time around, unfortunately, doesn't seem to be the state I told it to use in my "lexicalParseData" value. Any idea why? What am I doing wrong?
For instance, I have a line that starts "#setup" that I want to recognize the following:
#setup letter landscape regular 8 margin commas scale 1000 unlimited
I want to highlight #setup, letter, ..., except 8 and 1000 as keywords.
However, letter is only a keyword when it is on a line starting with #setup.
It seems to me that using lexical states is the correct way to handle this, so I tried to use the following code (sorry, it's long):
public class ReportLexer : IMergableLexicalParser
{
static Dictionary<int, DefaultLexicalState> states;
static Dictionary<string, int> setupKeywords;
static Dictionary<string, int> pagesizeKeywords;
static Dictionary<string, int> colHeadKeywords;
static Dictionary<string, int> variableLineKeywords;
static int MakeStateId(int previous, int current)
{
return (previous << 8) | current;
}
static void AddStatePair(int previous, int current)
{
int id = MakeStateId(previous, current);
states.Add(id, new DefaultLexicalState(id, id.ToString("x8")));
}
public static IEnumerable<ILexicalState> LexicalStates
{
get
{
foreach (KeyValuePair<int, DefaultLexicalState> kvp in states)
yield return kvp.Value;
}
}
static ReportLexer() {
states = new Dictionary<int, DefaultLexicalState>();
setupKeywords = new Dictionary<string, int>();
pagesizeKeywords = new Dictionary<string, int>();
colHeadKeywords = new Dictionary<string, int>();
variableLineKeywords = new Dictionary<string, int>();
setupKeywords.Add("letter", ReportTokenID.SetupLetter);
setupKeywords.Add("portrait", ReportTokenID.SetupPortrait);
setupKeywords.Add("landscape", ReportTokenID.SetupLandscape);
setupKeywords.Add("regular", ReportTokenID.SetupRegular);
setupKeywords.Add("compressed", ReportTokenID.SetupCompressed);
setupKeywords.Add("margin", ReportTokenID.SetupMargin);
setupKeywords.Add("commas", ReportTokenID.SetupCommas);
setupKeywords.Add("scale", ReportTokenID.SetupScale);
setupKeywords.Add("unlimited", ReportTokenID.SetupUnlimited);
pagesizeKeywords.Add("lines", ReportTokenID.PageSizeLines);
pagesizeKeywords.Add("columns", ReportTokenID.PageSizeColumns);
colHeadKeywords.Add("date", ReportTokenID.ColHeadDate);
colHeadKeywords.Add("number", ReportTokenID.ColHeadNumber);
variableLineKeywords.Add("as", ReportTokenID.VariablesAs);
variableLineKeywords.Add("columns", ReportTokenID.VariablesColumns);
variableLineKeywords.Add("rows", ReportTokenID.VariablesRows);
variableLineKeywords.Add("for", ReportTokenID.VariablesFor);
variableLineKeywords.Add("monthly", ReportTokenID.VariablesMonthly);
// default state (and section states)
states.Add(ReportLexicalStateID.Default, new DefaultLexicalState(ReportLexicalStateID.Default, "Default"));
states.Add(ReportLexicalStateID.ConstantsSection, new DefaultLexicalState(ReportLexicalStateID.ConstantsSection, "ConstantsSection"));
states.Add(ReportLexicalStateID.FormulasSection, new DefaultLexicalState(ReportLexicalStateID.FormulasSection, "FormulasSection"));
states.Add(ReportLexicalStateID.VariablesSection, new DefaultLexicalState(ReportLexicalStateID.VariablesSection, "VariablesSection"));
// this one doesn't care, since it's always going to go to VariablesSection when parsed.
states.Add(ReportLexicalStateID.VariablesLine, new DefaultLexicalState(ReportLexicalStateID.VariablesLine, "VariablesLine"));
// previous state default, current state x (line type)
int prev = ReportLexicalStateID.Default;
AddStatePair(prev, ReportLexicalStateID.ColHeadLine);
AddStatePair(prev, ReportLexicalStateID.HeaderLine);
AddStatePair(prev, ReportLexicalStateID.LineBreakLine);
AddStatePair(prev, ReportLexicalStateID.PageSizeLine);
AddStatePair(prev, ReportLexicalStateID.SetupLine);
AddStatePair(prev, ReportLexicalStateID.TextLine);
prev = ReportLexicalStateID.ConstantsSection;
AddStatePair(prev, ReportLexicalStateID.ColHeadLine);
AddStatePair(prev, ReportLexicalStateID.HeaderLine);
AddStatePair(prev, ReportLexicalStateID.LineBreakLine);
AddStatePair(prev, ReportLexicalStateID.PageSizeLine);
AddStatePair(prev, ReportLexicalStateID.SetupLine);
AddStatePair(prev, ReportLexicalStateID.TextLine);
prev = ReportLexicalStateID.FormulasSection;
AddStatePair(prev, ReportLexicalStateID.ColHeadLine);
AddStatePair(prev, ReportLexicalStateID.HeaderLine);
AddStatePair(prev, ReportLexicalStateID.LineBreakLine);
AddStatePair(prev, ReportLexicalStateID.PageSizeLine);
AddStatePair(prev, ReportLexicalStateID.SetupLine);
AddStatePair(prev, ReportLexicalStateID.TextLine);
prev = ReportLexicalStateID.VariablesSection;
AddStatePair(prev, ReportLexicalStateID.ColHeadLine);
AddStatePair(prev, ReportLexicalStateID.HeaderLine);
AddStatePair(prev, ReportLexicalStateID.LineBreakLine);
AddStatePair(prev, ReportLexicalStateID.PageSizeLine);
AddStatePair(prev, ReportLexicalStateID.SetupLine);
AddStatePair(prev, ReportLexicalStateID.TextLine);
}
public ReportLexer()
{
}
public ITokenLexicalParseData GetLexicalStateDefaultTokenLexicalParseData(ITextBufferReader reader, ILexicalState lexicalState)
{
reader.Read();
return new LexicalStateAndIDTokenLexicalParseData(lexicalState, (byte)lexicalState.DefaultTokenID);
}
public int ParseLineDirective(ITextBufferReader reader, ref int lexicalStateIDnext)
{
int startOfs = reader.Offset;
// already got the '#'...
while (char.IsLetter(reader.Peek()))
{
// consume the char and add it to the token.
reader.Read();
}
// now, the directive text is at range from startOfs to reader.Offset
string directive = reader.GetSubstring(startOfs, reader.Offset - startOfs).ToLower();
// check the directive against known directives
switch (directive)
{
case "setup":
lexicalStateIDnext = ReportLexicalStateID.SetupLine;
return ReportTokenID.Setup;
case "pagesize":
lexicalStateIDnext = ReportLexicalStateID.PageSizeLine;
return ReportTokenID.PageSize;
case "header":
lexicalStateIDnext = ReportLexicalStateID.HeaderLine;
return ReportTokenID.Header;
case "text":
lexicalStateIDnext = ReportLexicalStateID.TextLine;
return ReportTokenID.Text;
case "variables":
lexicalStateIDnext = ReportLexicalStateID.VariablesLine;
return ReportTokenID.Variables;
case "line":
lexicalStateIDnext = ReportLexicalStateID.LineBreakLine;
return ReportTokenID.Line;
case "colhead":
lexicalStateIDnext = ReportLexicalStateID.ColHeadLine;
return ReportTokenID.ColHead;
case "constants":
lexicalStateIDnext = ReportLexicalStateID.ConstantsSection;
return ReportTokenID.Constants;
case "formulas":
lexicalStateIDnext = ReportLexicalStateID.FormulasSection;
return ReportTokenID.Formulas;
case "column":
return ReportTokenID.Column;
}
return ReportTokenID.Invalid;
}
public MatchType GetNextTokenLexicalParseData(ITextBufferReader reader, ILexicalState lexicalState, ref ITokenLexicalParseData lexicalParseData)
{
// Initialize
int tokenID = ReportTokenID.Invalid;
int lexicalStateIDcurrent = lexicalState.ID;
int lexicalStateIDprevious = (lexicalState.ID >> 8) & 0xFF;
int lexicalStateIDnext = lexicalState.ID;
bool handled = false;
// Get the next character
int startOfs = reader.Offset;
char ch = reader.Read();
// handle stuff that's valid for any state
handled = true;
switch (ch)
{
case '#':
tokenID = ParseLineDirective(reader, ref lexicalStateIDnext);
break;
case ';':
tokenID = ParseSingleLineComment(reader);
break;
default:
// mark this item as unhandled.
handled = false;
break;
}
// handle whitespace
if (ch != '\n' && char.IsWhiteSpace(ch))
{
ch = reader.Peek();
while (!reader.IsAtEnd && ch != '\n' && char.IsWhiteSpace(ch))
{
reader.Read();
ch = reader.Peek();
}
tokenID = ReportTokenID.Whitespace;
handled = true;
}
// handle state-specific items.
if (!handled)
{
Dictionary<string, int> keywordDict = null;
bool supportsString = false;
// figure out how to parse this state
switch (lexicalStateIDcurrent)
{
case ReportLexicalStateID.Default:
// handle stuff for the default state.
// no valid tokens that aren't handled above.
break;
case ReportLexicalStateID.SetupLine:
keywordDict = setupKeywords;
supportsString = false;
goto case -1;
case ReportLexicalStateID.PageSizeLine:
keywordDict = pagesizeKeywords;
supportsString = false;
goto case -1;
case ReportLexicalStateID.ColHeadLine:
keywordDict = colHeadKeywords;
supportsString = true;
goto case -1;
case ReportLexicalStateID.VariablesLine:
keywordDict = variableLineKeywords;
supportsString = false;
goto case -1;
case -1:
// handle one of the "line" states with keywords
if (char.IsLetter(ch))
{
do ch = reader.Read(); while (!reader.IsAtEnd && char.IsLetter(ch));
if (!reader.IsAtEnd) reader.ReadReverse();
// get the keyword in a string
string keyword = reader.GetSubstring(startOfs, reader.Offset - startOfs).ToLower();
// check against known keywords.
int value;
if (keywordDict.TryGetValue(keyword, out value))
tokenID = value;
else if (supportsString)
{
// this must be a string, so match the rest of it.
tokenID = parseString(reader);
}
else
{
// this must be an identifier, so read the rest and consider it so.
ch = reader.Peek();
while (!reader.IsAtEnd && ch != '\n' && char.IsLetterOrDigit(ch))
{
reader.Read();
ch = reader.Peek();
}
// consume '%' or '$' as last char of identifier
if (ch == '%' || ch == '$')
reader.Read();
tokenID = ReportTokenID.Id;
}
}
// handle an integer
else if (char.IsDigit(ch))
tokenID = ParseNumber(reader, ch);
else if (ch == '\n')
{
// exit this state to the previous state.
lexicalStateIDnext = lexicalStateIDprevious;
tokenID = ReportTokenID.LineTerminator;
}
break;
case ReportLexicalStateID.LineBreakLine:
if (char.IsDigit(ch))
tokenID = ParseNumber(reader, ch);
break;
case ReportLexicalStateID.HeaderLine:
case ReportLexicalStateID.TextLine:
if (char.IsLetterOrDigit(ch) || ch == '>')
{
tokenID = parseString(reader);
}
else if (ch == '\n')
{
// exit this state to the previous state.
lexicalStateIDnext = lexicalStateIDprevious;
tokenID = ReportTokenID.LineTerminator;
}
break;
case ReportLexicalStateID.ConstantsSection:
// don't support anything here...
break;
case ReportLexicalStateID.VariablesSection:
// don't support anything here (for now...)
break;
case ReportLexicalStateID.FormulasSection:
// don't support anything here (for now...)
break;
}
}
DefaultLexicalState nextState;
int stateID = MakeStateId(lexicalStateIDcurrent, lexicalStateIDnext);
if (!states.TryGetValue(stateID, out nextState))
{
if (!states.TryGetValue(lexicalStateIDnext, out nextState))
throw new Exception("Unknown state ID: " + lexicalStateIDnext.ToString());
}
if (tokenID != ReportTokenID.Invalid)
{
lexicalParseData = new LexicalStateAndIDTokenLexicalParseData(nextState, (byte)tokenID);
return MatchType.ExactMatch;
}
else
{
reader.ReadReverse();
return MatchType.NoMatch;
}
}
Kelly Leahy Software Architect Milliman, USA