一雏蛮、實(shí)驗(yàn)?zāi)康?/h2>
通過(guò)本實(shí)驗(yàn)的編程實(shí)踐,使學(xué)生了解詞法分析的任務(wù)阱州,掌握詞法分析程序設(shè)計(jì)的原理和構(gòu)造方法挑秉,使學(xué)生對(duì)編譯的基本概念、原理和方法有完整的和清楚的理解苔货,并能正確地犀概、熟練地運(yùn)用立哑。
二、實(shí)驗(yàn)內(nèi)容
用 VC++/VB/JAVA 語(yǔ)言實(shí)現(xiàn)對(duì) C 語(yǔ)言子集的源程序進(jìn)行詞法分析姻灶。通過(guò)輸入源程序從左到右對(duì)字符串進(jìn)行掃描和分解铛绰,依次輸出各個(gè)單詞的內(nèi)部編碼及單詞符號(hào)自身值;若遇到錯(cuò)誤則顯示“Error”产喉,然后跳過(guò)錯(cuò)誤部分繼續(xù)顯示 捂掰;同時(shí)進(jìn)行標(biāo)識(shí)符登記符號(hào)表的管理。
三曾沈、程序描述
程序達(dá)到的狀態(tài)轉(zhuǎn)換圖如下:
package exp1;
import java.util.*;
import java.io.IOException;
import java.io.StringReader;
import static java.lang.Character.isDigit;
import static java.lang.Character.isLetter;
import static java.lang.Character.isLetterOrDigit;
import static java.lang.Character.isWhitespace;
/**
* 詞法分析器
*
* @version 2020-05-18
*/
public class Tokenizer {
/** 讀入的字符序列 */
private final StringReader src;
/** Tokenizer的狀態(tài)是否已經(jīng)分析過(guò) */
private boolean hasAnalysised;
/** 表示當(dāng)前是否到達(dá)結(jié)尾 */
private boolean eof;
/** 字符當(dāng)前所處的行數(shù) */
private int line;
/** 字符當(dāng)前行所處的位置 */
private int character;
/** 當(dāng)前字符在字符序列中的位置这嚣,實(shí)際應(yīng)和StringReader的私有屬性next相同 */
private int index;
/** 當(dāng)前字符的前一個(gè)字符 */
private char previous;
/** 標(biāo)志當(dāng)前是不是已經(jīng)使用了前一個(gè)字符 */
private boolean usePrevious;
/** 上一行的長(zhǎng)度 */
private int characterPreviousLine;
/** 關(guān)鍵詞表 */
final String[] keywords = new String[] { "do", "end", "for", "if", "printf", "scanf", "then", "while" };
/** 關(guān)鍵詞集合,為了快速判斷 */
final Set<String> keywordsSet = new HashSet<>(Arrays.asList(keywords));
/** 分隔符起始字符的集合 */
final Set<Character> delimitersSet = new HashSet<>(Arrays.asList(',', ';', '(', ')', '[', ']'));
/** 算數(shù)運(yùn)算符起始字符的集合 */
final Set<Character> operatorsASet = new HashSet<>(Arrays.asList('+', '-', '*', '/'));
/** 常量列表 */
final List<Number> constants = new ArrayList<>();
/** 標(biāo)識(shí)符列表 */
final Set<String> identifiers = new LinkedHashSet<>();
/**
* 詞法分析器的構(gòu)造器
*
* @param src 源代碼
*/
public Tokenizer(String src) {
this.src = new StringReader(src);
this.hasAnalysised = false;
this.line = this.character = 1;
this.index = this.characterPreviousLine = 0;
this.previous = 0;
this.eof = this.usePrevious = false;
}
/**
* 檢查是否已經(jīng)分析過(guò)
*
* @return 如果已經(jīng)分析過(guò)塞俱,返回true
*/
public boolean hasAnalysised() {
return hasAnalysised;
}
/**
* 讀取出一個(gè)字符串
*
* @return 讀取出的字符串
* @throws IOException
*/
private String nextString() throws IOException {
StringBuilder sb = new StringBuilder();
char c;
for (;;) {
c = this.next();
/*
* 以字母開(kāi)頭不是關(guān)鍵詞就是標(biāo)識(shí)符姐帚,該方法讀出一個(gè)符合正規(guī)式/[A-Za-z]([A-Za-z0-9]*)/的字符串
* 即以字母開(kāi)頭,字母和數(shù)字匹配零次或多次的字符串
*/
if (isLetterOrDigit(c)) {
sb.append(c);
} else if (c == 0) {
return sb.toString();
} else {
this.back();
return sb.toString();
}
}
}
/**
* 讀取出一個(gè)數(shù)字字符串
*
* @return 讀取出的數(shù)字字符串
* @throws IOException
*/
private String nextNumberString() throws IOException {
StringBuilder sb = new StringBuilder();
char c;
for (;;) {
c = this.next();
/*
* 以數(shù)字開(kāi)頭可能為常量障涯,或者為出錯(cuò)的單詞罐旗,此方法讀取出一個(gè)符合正規(guī)式/([0-9])([0-9.]*)/,即以0-9開(kāi)頭唯蝶,0-9和.匹配零次或多次的字符串
* 對(duì)于末尾是.的或者是出現(xiàn)多次.的九秀,表示出現(xiàn)錯(cuò)誤,交由numberAnalysis方法處理
*/
if (isDigit(c)) {
sb.append(c);
} else if (c == '.') {
sb.append(c);
} else if (c == 0) {
return sb.toString();
} else {
this.back();
return sb.toString();
}
}
}
/**
* 解析出字符串表示的數(shù)字
*
* @param numString 數(shù)字字符串
* @return 數(shù)字字符串代表的值
*/
private Number parseNumber(String numString) {
/* 傳進(jìn)來(lái)的字符串生棍,不是標(biāo)準(zhǔn)的整數(shù)颤霎,就是標(biāo)準(zhǔn)的浮點(diǎn)數(shù),不會(huì)發(fā)生NumberFormatException */
try {
return new Integer(numString);
} catch (NumberFormatException e) {
return new Double(numString);
}
}
/**
* 獲取下一個(gè)非空白字符
*
* @return 到達(dá)末尾后返回0涂滴,下一個(gè)非空白字符
* @throws IOException
*/
private char nextClean() throws IOException {
for (;;) {
char c = this.next();
if (c == 0 || !isWhitespace(c)) {
return c;
}
}
}
/**
* 獲取源代碼的下一個(gè)字符
*
* @return 到達(dá)末尾后返回0,否則返回源代碼的下一個(gè)字符
* @throws IOException
*/
private char next() throws IOException {
int c;
/* 如果回退過(guò)晴音,返回回退的字符 */
if (this.usePrevious) {
this.usePrevious = false;
c = this.previous;
} else {
/* 否則柔纵,讀取新的字符 */
c = this.src.read();
}
/* 判斷是不是到達(dá)結(jié)尾 */
if (c <= 0) { // 到達(dá)末尾
this.eof = true;
return 0;
}
this.incrementIndexes(c);
this.previous = (char) c;
return this.previous;
}
/**
* 增加{@link #index}并處理?yè)Q行,此方法僅應(yīng)當(dāng)在{@link #next()}中調(diào)用
*
* @param c 讀入的字符
*/
private void incrementIndexes(int c) {
if (c > 0) {
this.index++;
/* 因?yàn)檫\(yùn)行系統(tǒng)不同锤躁,行尾序列會(huì)有'\n' (*inx, Mac OS 10+), '\r\n' (Windows), '\r' (Mac OS 9-)搁料,需做處理*/
if (c == '\r') {
this.line++;
this.characterPreviousLine = this.character;
this.character = 0;
} else if (c == '\n') {
if (this.previous != '\r') {
this.line++;
this.characterPreviousLine = this.character;
}
this.character = 0;
} else {
this.character++;
}
}
}
/**
* 回退一個(gè)字符,用于實(shí)現(xiàn)超前掃描系羞。
*
* @throws IOException 如果已經(jīng)在字符串起始郭计,或者已經(jīng)回退過(guò)一次
*/
public void back() throws IOException {
if (this.usePrevious) {
throw new IOException("回退兩次不允許");
} else if (this.index <= 0) {
throw new IOException("已經(jīng)在開(kāi)頭位置,無(wú)法回退");
}
this.decrementIndexes();
this.usePrevious = true;
this.eof = false;
}
/**
* 回退{(lán)@link #index}并處理?yè)Q行椒振,此方法僅應(yīng)當(dāng)在{@link #back()}中調(diào)用
*/
private void decrementIndexes() {
this.index--;
if (this.previous == '\r' || this.previous == '\n') {
this.line--;
this.character = this.characterPreviousLine;
} else if (this.character > 0) {
this.character--;
}
}
/**
* 進(jìn)行詞法分析昭伸,返回分析列表
*
* @return 單詞分析結(jié)果的列表
* @throws IOException
*/
public List<WordExp> analysis() throws IOException {
this.hasAnalysised = true;
List<WordExp> res = new ArrayList<>();
int c;
/* c為0表示已經(jīng)分析到結(jié)尾,采用 nextClean 方法略過(guò)空白字符 */
while ((c = nextClean()) != 0) {
if (isLetter(c)) {
this.back();
this.analysisString(res); /* 處理字母開(kāi)頭 */
} else if (isDigit(c)) {
this.back();
this.analysisNumber(res); /* 處理數(shù)字開(kāi)頭 */
} else {
this.back();
this.analysisOther(res); /* 處理其他字符開(kāi)頭 */
}
}
return res;
}
/**
* 分析以字母開(kāi)頭的符號(hào)串
*
* @param res 分析結(jié)果列表
* @throws IOException
*/
private void analysisString(List<WordExp> res) throws IOException {
/* 讀出一個(gè)以字母開(kāi)頭澎迎,之后含有字母和數(shù)字零次或多次的字符串 */
String cur = this.nextString();
/* 如果讀出的字符串為關(guān)鍵詞 */
if (keywordsSet.contains(cur)) {
res.add(new WordExp(cur, WordType.KEYWORD, this.line, this.character));
} else if (identifiers.contains(cur)) {
/* 如果讀出的字符串已經(jīng)存在在標(biāo)識(shí)符列表中 */
res.add(new WordExp(cur, WordType.IDENTIFIER, this.line, this.character));
} else {
/* 如果不存在在標(biāo)識(shí)符列表中庐杨,假如標(biāo)識(shí)符列表 */
identifiers.add(cur);
res.add(new WordExp(cur, WordType.IDENTIFIER, this.line, this.character));
}
}
/**
* 分析以數(shù)字開(kāi)頭的符號(hào)串
*
* @param res 分析結(jié)果列表
* @throws IOException
*/
private void analysisNumber(List<WordExp> res) throws IOException {
/* 讀出一個(gè)以數(shù)字開(kāi)頭选调,之后含有數(shù)字和.零次或多次的字符串 */
String numString = this.nextNumberString();
/* 超前掃描下一個(gè)字符 */
char c = this.next();
if (!isLetter(c)) {
if (c != 0)
this.back();
/* 如果以.結(jié)尾,表示出現(xiàn)了123.這種錯(cuò)誤 */
if (numString.endsWith(".")) {
res.add(new WordExp(numString, WordType.ERROR, this.line, this.character));
return;
}
/* 如果字符串含有兩個(gè)及以上的.灵份,說(shuō)明出現(xiàn)了類似123.123.123這種錯(cuò)誤 */
else if (numString.indexOf('.') != numString.lastIndexOf('.')) {
res.add(new WordExp(numString, WordType.ERROR, this.line, this.character));
return;
}
/* 否則仁堪,是正常的常數(shù)串 */
constants.add(parseNumber(numString));
res.add(new WordExp(numString, WordType.CONSTANT, this.line, this.character));
return;
}
/* 如果下一個(gè)字符是字母的話,說(shuō)明會(huì)出現(xiàn)形如123abc這種類似的錯(cuò)誤*/
this.back();
/* 讀取出后面的那個(gè)部分字符串 */
String cur = this.nextString();
cur = numString + cur;
res.add(new WordExp(cur, WordType.ERROR, this.line, this.character));
}
/**
* 分析以非字母和數(shù)字開(kāi)頭的符號(hào)串
*
* @param res 分析結(jié)果列表
* @throws IOException
*/
private void analysisOther(List<WordExp> res) throws IOException {
char c = this.next();
char c2;
if (c == '/') {
/* 如果以/開(kāi)頭填渠,表示可能是注釋弦聂,除法符號(hào) */
c2 = this.next();
/* 如果下一個(gè)字符是/,表示為注釋氛什,而且注釋到下一個(gè)回車或者結(jié)束莺葫,不用記錄 */
if (c2 == '/') {
for (;;) {
c2 = this.next();
if (c2 == '\r' || c2 == '\n' || c2 == '0')
return;
}
/* 如果下一個(gè)字符是*,表示為注釋屉更,注釋到下一個(gè)*和/徙融,不用記錄 */
} else if (c2 == '*') {
StringBuilder con = new StringBuilder("/*");
for (;;) {
c2 = this.next();
con.append(c2);
/* 一直讀取并鸵,直到*和/ */
if (c2 == '*') {
c2 = this.next();
con.append(c2);
if (c2 == '/')
return;
else
continue;
/* 但是如果未找到匹配的*和/就達(dá)到了結(jié)尾娩梨,表示注釋出錯(cuò)剥啤,要添加記錄 */
} else if (c2 == 0) {
res.add(new WordExp(con.toString(), WordType.ERROR, this.line, this.character));
return;
}
}
/* 如果不是/和*覆醇,表示是除法符號(hào) */
} else {
res.add(new WordExp("/", WordType.OPERATOR_A, this.line, this.character));
return;
}
/* 如果讀取到的符號(hào)是分隔符 */
} else if (delimitersSet.contains(c)) {
res.add(new WordExp(c + "", WordType.DELIMITER, this.line, this.character));
return;
}
/* 如果讀取到的符號(hào)是算數(shù)運(yùn)算符 */
else if (operatorsASet.contains(c)) {
res.add(new WordExp(c + "", WordType.OPERATOR_A, this.line, this.character));
return;
/* 如果是邏輯運(yùn)算符蒸健,因?yàn)檫壿嬤\(yùn)算符包含<=和>=以及<>淮韭,也要超前搜索 */
} else if (c == '<') {
c2 = this.next();
/* 如果是<> */
if (c2 == '>') {
res.add(new WordExp("<>", WordType.OPERATOR_L, this.line, this.character));
return;
/* 如果是<= */
} else if (c2 == '=') {
res.add(new WordExp("<=", WordType.OPERATOR_L, this.line, this.character));
return;
} else {
/* 如果是< */
this.back();
res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
return;
}
} else if (c == '>') {
c2 = this.next();
/* 如果是>= */
if (c2 == '=') {
res.add(new WordExp(">=", WordType.OPERATOR_L, this.line, this.character));
return;
/** 如果是> */
} else {
this.back();
res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
return;
}
/* 如果是= */
} else if (c == '=') {
res.add(new WordExp(c + "", WordType.OPERATOR_L, this.line, this.character));
return;
/* 如果什么也不是窗慎,出錯(cuò) */
} else {
res.add(new WordExp(c + "", WordType.ERROR, this.line, this.character));
return;
}
}
public static void main(String[] args) throws IOException {
Scanner scanner = new Scanner(System.in);
scanner.useDelimiter("(\r\n\r\n)|(\n\n)|(\r\r)");
String src = scanner.next();
Tokenizer tokenizer = new Tokenizer(src);
List<WordExp> wordExps = tokenizer.analysis();
// System.out.println(wordExps);
System.out.printf("%-16s%-16s%-16s%-16s", "單詞", "二元序列", "類 型", "位置(行铺坞,列)");
System.out.println();
System.out.printf("%-16s(單詞種別渤早,單詞屬性)", "");
System.out.println();
for (WordExp word : wordExps) {
if (word.type != WordType.ERROR) {
System.out.printf("%-16s%-16s%-16s%-16s", word.word, "(" + word.type.id + ", " + word.word + ")",
word.type.type, "(" + word.line + ", " + word.column + ")");
System.out.println();
} else {
System.out.printf("%-16s%-16s%-16s%-16s", word.word, "Error", "Error",
"(" + word.line + ", " + word.column + ")");
System.out.println();
}
}
scanner.close();
}
@Override
public String toString() {
return " at " + this.index + " [row " + this.line + " column " + this.character + "]";
}
}
enum WordType {
/** 關(guān)鍵詞 */
KEYWORD(1, "關(guān)鍵詞"),
/** 分隔符 */
DELIMITER(2, "分隔符"),
/** 算數(shù)運(yùn)算符 */
OPERATOR_A(3, "算術(shù)運(yùn)算符"),
/** 關(guān)系運(yùn)算符 */
OPERATOR_L(4, "關(guān)系運(yùn)算符"),
/** 常量 */
CONSTANT(5, "常量"),
/** 標(biāo)識(shí)符 */
IDENTIFIER(6, "標(biāo)識(shí)符"),
/** 錯(cuò)誤 */
ERROR(7, "錯(cuò)誤");
public int id;
public String type;
private WordType(int id, String type) {
this.id = id;
this.type = type;
}
@Override
public String toString() {
return this.type;
}
};
final class WordExp {
/** 單詞 */
final String word;
/** 單詞類型 */
final WordType type;
/** 單詞所處的行 */
final int line;
/** 單詞所處的列 */
final int column;
public WordExp(String word, WordType type, int line, int column) {
this.word = word;
this.type = type;
this.line = line;
this.column = column;
}
@Override
public String toString() {
return type.toString() + word + "职车,出現(xiàn)在 " + " (" + this.line + "," + this.column + ")";
}
}