|
|
A simple lexical scanner in Java
Lexical analysis of a character stream is a key step whenever a program needs to process
textual input.
The Lexer class, below,
streamlines the task of matching regular expressions against the input, as well as that of producing
Token objects that precisely describe the matched string and its location within the input stream.
import java.io.*;
import java.util.Vector;
import java.util.regex.*;
/**
 * A minimal, regular-expression driven lexical scanner.
 *
 * <p>The whole input is held in memory. Each call to one of the {@code next}
 * methods tries to match a pattern at the current scan position; on success
 * the matched text is consumed and returned as a {@link Token} carrying its
 * begin and end {@link Location}s.
 *
 * <p>Lines and columns are zero-based. Only {@code '\n'} starts a new line.
 */
public class Lexer
{
    /** A zero-based (line, column) position within the scanned input. */
    public static class Location
    {
        /** Zero-based line number. */
        public final int line;
        /** Zero-based offset from the start of the line, in chars. */
        public final int column;

        public Location(int line, int column)
        {
            this.line = line;
            this.column = column;
        }

        /** Renders the location as {@code line:column}. */
        @Override
        public String toString()
        {
            return line + ":" + column;
        }
    }

    /** A matched piece of input together with where it was found. */
    public static class Token
    {
        /** Caller-supplied token kind; 0 when produced by the id-less {@code next} overloads. */
        public final int id;
        /** The exact text that was matched. */
        public final String image;
        /** Location of the first matched char. */
        public final Location begin;
        /** Location just past the last matched char (exclusive end). */
        public final Location end;

        public Token(String image, int id, Location begin, Location end)
        {
            this.id = id;
            this.image = image;
            this.begin = begin;
            this.end = end;
        }

        /** Renders the token as {@code ['image' id=N begin..end]}. */
        @Override
        public String toString()
        {
            return "['" + image + "' id=" + id + " " + begin + ".." + end + "]";
        }
    }

    /** The complete input; never re-sliced, scanning advances {@link #where} instead. */
    private final String input;
    /** Offset into {@link #input} of the next unconsumed char. */
    private int where = 0;
    /** Strictly increasing offsets at which each line begins; element 0 is always 0. */
    private final int[] lineStartOffsets;

    /**
     * Creates a scanner over the given text.
     *
     * @param s the text to scan
     * @throws NullPointerException if {@code s} is null
     */
    public Lexer(String s)
    {
        if (s == null)
        {
            throw new NullPointerException("s");
        }
        this.input = s;
        this.lineStartOffsets = computeLineStarts(s);
    }

    /**
     * Creates a scanner over everything readable from the stream. Bytes are
     * decoded with the default charset, because that is what the
     * single-argument {@link InputStreamReader} constructor uses. The stream
     * is read to its end but not closed.
     *
     * @throws IOException if reading fails
     */
    public Lexer(InputStream is) throws IOException
    {
        this(new InputStreamReader(is));
    }

    /**
     * Creates a scanner over everything readable from the reader. The reader
     * is read to its end but not closed.
     *
     * @throws IOException if reading fails
     */
    public Lexer(Reader r) throws IOException
    {
        this(makeStr(r));
    }

    /** Drains the reader into a string, reading in chunks rather than char by char. */
    private static String makeStr(Reader r) throws IOException
    {
        StringBuilder sb = new StringBuilder();
        char[] chunk = new char[4096];
        int n;
        while ((n = r.read(chunk)) >= 0)
        {
            sb.append(chunk, 0, n);
        }
        return sb.toString();
    }

    /**
     * Builds the table of line-start offsets: 0, plus the offset following
     * every {@code '\n'}. No end-of-input sentinel is stored, so an offset
     * equal to the input length is never mistaken for the start of a new line.
     */
    private static int[] computeLineStarts(CharSequence cs)
    {
        int lineCount = 1;
        for (int i = 0; i < cs.length(); ++i)
        {
            if (cs.charAt(i) == '\n')
            {
                ++lineCount;
            }
        }
        int[] starts = new int[lineCount]; // starts[0] is already 0
        int ln = 0;
        for (int i = 0; i < cs.length(); ++i)
        {
            if (cs.charAt(i) == '\n')
            {
                starts[++ln] = i + 1;
            }
        }
        return starts;
    }

    /**
     * Converts an absolute offset into a zero-based line/column pair.
     *
     * @param offset an offset in {@code [0, input.length()]}; the upper bound
     *               is inclusive so that exclusive token ends can be located
     * @throws IndexOutOfBoundsException if the offset lies outside that range
     */
    private Location locationOf(int offset)
    {
        if (offset < 0 || offset > input.length())
        {
            throw new IndexOutOfBoundsException(
                "offset " + offset + " outside 0.." + input.length());
        }
        int idx = java.util.Arrays.binarySearch(lineStartOffsets, offset);
        // Not found: idx == -(insertionPoint) - 1, and the containing line is
        // the one just before the insertion point.
        int ln = (idx >= 0) ? idx : -idx - 2;
        return new Location(ln, offset - lineStartOffsets[ln]);
    }

    /**
     * Tries to match {@code p} at the current scan position and, on success,
     * consumes the matched text.
     *
     * <p>The match is attempted on the unconsumed remainder only: {@code ^}
     * matches at the current position and look-behind cannot see text that
     * was already consumed. A pattern that matches the empty string yields a
     * token with an empty image and does not advance the scanner.
     *
     * @param p  the pattern to match
     * @param id the value to store in {@link Token#id}
     * @return the matched token, or {@code null} if {@code p} does not match
     *         at the current position
     */
    public Token next(Pattern p, int id)
    {
        Matcher m = p.matcher(input);
        // Restrict matching to the unconsumed tail. With the Matcher's default
        // anchoring, non-transparent bounds this behaves like matching against
        // a fresh substring, without copying the input for every token.
        m.region(where, input.length());
        if (!m.lookingAt())
        {
            return null;
        }
        int end = m.end();
        Token result = new Token(input.substring(where, end), id,
                                 locationOf(where), locationOf(end));
        where = end;
        return result;
    }

    /**
     * Same as {@link #next(Pattern, int)} with a token id of 0.
     *
     * @return the matched token, or {@code null} if there is no match at the
     *         current position
     */
    public Token next(Pattern p)
    {
        return next(p, 0);
    }

    /**
     * Compiles {@code regexp} and delegates to {@link #next(Pattern, int)}.
     * Callers scanning in a loop should compile the pattern once and use the
     * {@link Pattern} overload instead.
     */
    public Token next(String regexp, int id)
    {
        return next(Pattern.compile(regexp), id);
    }

    /**
     * Compiles {@code regexp} and delegates to {@link #next(Pattern)}.
     *
     * @return the matched token, or {@code null} if there is no match at the
     *         current position
     */
    public Token next(String regexp)
    {
        return next(Pattern.compile(regexp));
    }
}
|
|