福大软工1816 · 第五次作业

1.结对同学的博客链接、本作业博客的链接、所Fork的同名仓库的Github项目地址

Github地址：https://github.com/Stellaaa18/pair-project

结对同学的博客链接：http://www.cnblogs.com/Stella12/p/9769114.html

本作业博客的链接：https://edu.cnblogs.com/campus/fzu/Grade2016SE/homework/2138

2.分工

　　杨礼亮：负责爬虫、处理数据；分权重词频统计部分代码；实现字符、行数统计部分代码；

　　（队友）林翔宇：负责实现多参数混合使用功能实现；词组、单词、统计部分代码；负责整合代码；性能分析；

　　各自写自己负责部分的博客。

3.PSP表格

PSP2.1	Personal Software Process Stages	预估耗时（分钟）	实际耗时（分钟）
Planning	计划	20	30
· Estimate	· 估计这个任务需要多少时间	20	30
Development	开发	750	820
· Analysis	· 需求分析 (包括学习新技术)	200	230
· Design Spec	· 生成设计文档	30	40
· Design Review	· 设计复审	30	10
· Coding Standard	· 代码规范 (为目前的开发制定合适的规范)	100	60
· Design	· 具体设计	200	300
· Coding	· 具体编码	100	100
· Code Review	· 代码复审	30	20
· Test	· 测试（自我测试，修改代码，提交修改）	60	60
Reporting	报告	80	100
· Test Repor	· 测试报告	60	60
· Size Measurement	· 计算工作量	5	5
· Postmortem & Process Improvement Plan	· 事后总结, 并提出过程改进计划	15	35
	合计	850	950

4. 使用工具爬取论文信息

爬虫工具：八爪鱼采集器

使用方法：

　　这个工具使用起来很简易。

　　首先建立一个自定义采集，输入将要采集的网页网址，保存网址后便会自动跳转。

　　第二，我将标题与简介分开采集，采集标题只需要在本页面直接采集，采集简介要设计循环链接。

　　采集完成后就可以以excel格式导出，再将excel中的数据粘贴到文本中。

　　我是将标题和简介放到两个文本中，采集后的数据比较混乱，需要按格式输出，所以用c语言写了一些程序来整理文本。

这里附上一些伪代码

//在简介前面加上 Abstract: 
void Abstract(char *filename)
{
    ifstream file;
    file.open(filename);
    
    string s;
    string t="Abstract: ";
    ofstream fileOutput;
    fileOutput.open("2.txt", ios::app);
    while(getline(file, s))
    {
        fileOutput << t+s << endl;
    }
    
    file.close();
}

//在标题前面加上 Title:  
void Title(char *filename)
{
    ifstream file;
    file.open(filename);
    
    string s;
    string t="Title: ";
    ofstream fileOutput;
    fileOutput.open("1.txt", ios::app);
    while(getline(file, s))
    {
        fileOutput << t+s << endl;
    }
    
    file.close();
}

//按格式输出
void Title(char *filename)
{
    ifstream file;
    file.open(filename);
    
    string s;
    int count= 979;
    for(int i=0;i<count;i++)
    {
        getline(file, s);
        title[i]=s;
    }
    file.close();
}

void Abstract(char *filename)
{
    ifstream file;
    file.open(filename);
    
    string s;
        int count= 979;
    for(int i=0;i<count;i++)
    {
        getline(file, s);
        abstract[i]=s;
    }
    file.close();
}

int main()
{
    ofstream fileOutput;
    fileOutput.open("result.txt", ios::app);
    char filename1[105]= "1.txt";
    char filename2[105]= "2.txt";
    Title(filename1);
    Abstract(filename2);
    int count= 979;
    for(int i=0;i<count;i++)
    { 
        fileOutput<<i<<endl;
        fileOutput<<title[i]<<endl;
        fileOutput<<abstract[i]<<endl;
        fileOutput<<endl<<endl;
    }
    return 0;
}

　　最终结果如下

5.代码组织与内部实现设计（类图）

分装成五个接口函数，分别实现多参数混合使用、字符统计、行数统计、词组统计、分权重词频统计

int AnalyseParameter(char **argv);
int CountChar(string name);
int CountLines(string name);
void PhraseFreq(string str);
void TopNPhrases();
int WordFreq(string s);
void TopNWords();

内部实现如下：

类图：

6.算法的关键与关键实现部分流程图

本次任务的关键在于词组统计的实现，实现部分代码如下：

void PhraseFreq(string s)
{
    string word = "";
    string phrase = "";
    int cnt = 0, secWord;
    int len = s.length();
    for (int i = 0; i < len; i++) {
        if (s[i] >= 'A' && s[i] <= 'Z') {
            s[i] += 32; //change into lower case
        }
        if ((s[i] >= '0'&&s[i] <= '9') || (s[i] >= 'a'&&s[i] <= 'z')) {
            word += s[i];
            if (i == len - 1 && IsWord(word)) {
                cnt++;
                if (cnt == 1) {
                    phrase += word;
                }
                else {
                    phrase += " ";
                    phrase += word;
                }
                if (cnt == 2) {
                    secWord = i - word.size() - 1;
                }
                if (cnt == phraseLen) {
                    if (weightOnWord && inTitle) {
                        mapPhrase[phrase] += 10;
                    }
                    else
                        mapPhrase[phrase]++;
                    i = secWord;
                }
            }
        }
        else if (word != "") {
            if (IsWord(word)) {
                cnt++;
                if (cnt == 1) {
                    phrase += word;
                }
                else {
                    phrase += " ";
                    phrase += word;
                }
                if (cnt == 2) {
                    secWord = i - word.size() - 1;
                }
                if (cnt == phraseLen) {
                    if (weightOnWord && inTitle) {
                        mapPhrase[phrase] += 10;
                    }
                    else
                        mapPhrase[phrase]++;
                    i = secWord;
                    cnt = 0;
                    phrase = "";
                }
            }
            else {
                cnt = 0;
                phrase = "";
            }
            word = "";
        }

    }
}

流程图：

7.关键代码解释

以下是主函数的部分代码，用来处理输入文件

int main()
{
        ifstream fin(inputFile); 
    string str;
    while (getline(fin, str)) { //按行读取文件内容 
        inTitle = 0;
        if (str.substr(0, 7) == "Title: ") {//"Title: "不计算在内
            str = str.substr(7, str.length());
            inTitle = 1;
        }
        else if (str.substr(0, 10) == "Abstract: ") {//"Abstract: "不计算在内
            str = str.substr(10, str.length());
        }
        else if (IsNum(str[0]) || str==""){ //不统计论文编号和空行
            continue;
        }
        if (countPhrase) {//countPhrase=1,启用词组词频统计
            PhraseFreq(str);
        }
        characters += CountChar(str);
        words += WordFreq(str);
        lines += CountLines(str);
    }
    fin.close();

        return 0;

}
```
//统计一行中的词组词频
```
void PhraseFreq(string s)
{
    string word = "";
    string phrase = "";
    int cnt = 0, secWord;
    int len = s.length();
    for (int i = 0; i < len; i++) {
        if (s[i] >= 'A' && s[i] <= 'Z') {
            s[i] += 32; //大写转小写
        }
              //遍历到最后一个字符，和遇到分隔符一样，要判断当前词组是否符合条件
        if ((s[i] >= '0'&&s[i] <= '9') || (s[i] >= 'a'&&s[i] <= 'z')) {
            word += s[i];
            if (i == len - 1 && IsWord(word)) { 
                cnt++;
                if (cnt == 1) {
                    phrase += word;
                }
                else {
                    phrase += " ";
                    phrase += word;
                }
                if (cnt == 2) {
                    secWord = i - word.size()-1;
                }
                if (cnt == phraseLen) {
                    if (weightOnWord && inTitle) {
                        mapPhrase[phrase] += 10;
                    }
                    else
                        mapPhrase[phrase]++;
                    i = secWord;
                }
            }
        }
        else if(word!=""){ //遇到分隔符且word不为空
            if (IsWord(word)) { //如果是有效单词
                cnt++;
                                //单词加到phrase中
                if (cnt == 1) {
                    phrase += word;
                }
                else {
                    phrase += " ";
                    phrase += word;
                }
                if (cnt == 2) { //记录词组第二个单词的位置
                    secWord = i - word.size()-1;
                }
                if (cnt == phraseLen) { 
                    if (weightOnWord && inTitle) { //若启用权重统计并且是属于title的单词
                        mapPhrase[phrase]+=10; //权重为10
                    }
                    else
                        mapPhrase[phrase]++;
                    i = secWord; //回到第二个单词首
                    cnt = 0;  //当前单词数清零
                    phrase = ""; //phrase清零
                }
            }
            else {
                cnt = 0;
                phrase = "";
            }
            word = "";
        }

    }
}

8.性能分析与改进

展示性能分析图和程序中消耗最大的函数
使用爬取的论文进行测试，输入命令行参数为WordCount.exe -i paper.txt -o result.txt -w 1 -m 3 -n 10，运行时间为18.716s,输出结果如下：

性能报告截图：

从中可以看出，消耗最大的函数是TopNPhrase(),占据总运行时间的49.1%。

9.单元测试

附上检测代码：

        TEST_METHOD(TestMethod1)
        {
            string name;
            name="chartest1.txt";
            int chars = CountChar(name);
            Assert::AreEqual(chars, 40);
        }
        TEST_METHOD(TestMethod2)
        {
            string name;
            name = "chartest2.txt";
            int chars = CountChar(name);
            Assert::AreEqual(chars, 74);
        }
        TEST_METHOD(TestMethod3)
        {
            string name;
            name = "wordtest1.txt";
            ifstream fin(name);
            int words = 0;
            string str;
            while (getline(fin, str)) {
                inTitle = 0;
                if (str.substr(0, 7) == "Title: ") {
                    str = str.substr(7, str.length());
                    inTitle = 1;
                }
                else if (str.substr(0, 10) == "Abstract: ") {
                    str = str.substr(10, str.length());
                }
                else if (IsNum(str[0]) || str == "") { //ignore paper number and empty lines
                    continue;
                }
                if (countPhrase) {
                    PhraseFreq(str);
                }
                words += WordFreq(str);
            }
            fin.close();
            Assert::AreEqual(words, 5);
        }
        TEST_METHOD(TestMethod4)
        {
            string name;
            name = "wordtest2.txt";
            ifstream fin(name);
            int words = 0;
            string str;
            while (getline(fin, str)) {
                inTitle = 0;
                if (str.substr(0, 7) == "Title: ") {
                    str = str.substr(7, str.length());
                    inTitle = 1;
                }
                else if (str.substr(0, 10) == "Abstract: ") {
                    str = str.substr(10, str.length());
                }
                else if (IsNum(str[0]) || str == "") { //ignore paper number and empty lines
                    continue;
                }
                if (countPhrase) {
                    PhraseFreq(str);
                }
                words += WordFreq(str);
            }
            fin.close();
            Assert::AreEqual(words, 9);
        }
        TEST_METHOD(TestMethod5)
        {
            string name;
            name = "linetest1.txt";
            ifstream fin(name);
            int lines = 0;
            string str;
            while (getline(fin, str)) {
                inTitle = 0;
                if (str.substr(0, 7) == "Title: ") {
                    str = str.substr(7, str.length());
                    inTitle = 1;
                }
                else if (str.substr(0, 10) == "Abstract: ") {
                    str = str.substr(10, str.length());
                }
                else if (IsNum(str[0]) || str == "") { //ignore paper number and empty lines
                    continue;
                }
                if (countPhrase) {
                    PhraseFreq(str);
                }
                lines += CountLines(str);
            }
            fin.close();
            Assert::AreEqual(lines, 5);
        }
        TEST_METHOD(TestMethod6)
        {
            string name;
            name = "linetest2.txt";
            ifstream fin(name);
            int lines = 0;
            string str;
            while (getline(fin, str)) {
                inTitle = 0;
                if (str.substr(0, 7) == "Title: ") {
                    str = str.substr(7, str.length());
                    inTitle = 1;
                }
                else if (str.substr(0, 10) == "Abstract: ") {
                    str = str.substr(10, str.length());
                }
                else if (IsNum(str[0]) || str == "") { //ignore paper number and empty lines
                    continue;
                }
                if (countPhrase) {
                    PhraseFreq(str);
                }
                lines += CountLines(str);
            }
            fin.close();
            Assert::AreEqual(lines, 5);
        }
        TEST_METHOD(TestMethod7)
        {
            string name;
            name = "characterstest.txt";
            ifstream fin(name);
            int characters = 0;
            string str;
            while (getline(fin, str)) {
                inTitle = 0;
                if (str.substr(0, 7) == "Title: ") {
                    str = str.substr(7, str.length());
                    inTitle = 1;
                }
                else if (str.substr(0, 10) == "Abstract: ") {
                    str = str.substr(10, str.length());
                }
                else if (IsNum(str[0]) || str == "") { //ignore paper number and empty lines
                    continue;
                }
                if (countPhrase) {
                    PhraseFreq(str);
                }
                characters += CountChar(str);
            }
            fin.close();
            Assert::AreEqual(characters, 5);
        }
        TEST_METHOD(TestMethod8)
        {
            string name;
            name = "phrasetest1.txt";
            ifstream fin(name);
            string str;
            while (getline(fin, str)) {
                inTitle = 0;
                if (countPhrase) {
                    PhraseFreq(str);
                }
            }
            fin.close();
            if (countPhrase) { //phrase statistics
                TopNPhrases();
            }
            else { //word statistics
                TopNWords();
            }
            for (auto x : mapPhrase) {
                vecPhrase.push_back(x);
            }
            sort(vecPhrase.begin(), vecPhrase.end(), cmp2);
            Assert::AreEqual(vecPhrase[1].first, string("monday tuesday wednesday"));
            Assert::AreEqual(vecPhrase[1].second, 11);
            
        }
        TEST_METHOD(TestMethod9)
        {
            string name;
            name = "phrasetest2.txt";
            ifstream fin(name);
            string str;
            while (getline(fin, str)) {
                inTitle = 0;
                if (countPhrase) {
                    PhraseFreq(str);
                }
            }
            fin.close();
            if (countPhrase) { //phrase statistics
                TopNPhrases();
            }
            else { //word statistics
                TopNWords();
            }
            for (auto x : mapPhrase) {
                vecPhrase.push_back(x);
            }
            sort(vecPhrase.begin(), vecPhrase.end(), cmp2);
            Assert::AreEqual(vecPhrase[1].first, string("convolutional neural networks"));
            Assert::AreEqual(vecPhrase[1].second, 196);
        }

10.遇到的代码模块异常或结对困难及解决方法

困难：

　　尝试用python写爬虫；

　　在结对合作的过程中，各部分的代码是两个人分开写的，在定义函数接口上出现的参数不一致的情况，合并代码麻烦；

　　在写代码的过程中两个人都没有注释代码的习惯，看对方的代码特别吃力，很多变量和代码逻辑意思不懂。

　　结对过程中联系很少，经常不知道对方的进度。

解决方法：

　　用python写爬虫还没实现；

　　彼此约定下次写各自代码时一定要注释清楚。

11.评价你的队友

　　值得学习：办事效率高，代码能力强大，技术稳稳的，全程带飞我，紧抱队友大腿。

　　需要改进的地方：合作过程沟通较少，彼此工作比较分离。

12.学习进度条

第N周	新增代码（行）	累计代码（行）	本周学习耗时(小时)	累计学习耗时（小时）	重要成长
5	200	600	30	80	1.熟悉了爬虫工具的使用 2.使用c++处理数据 3.写代码过程中学会了新的算法 4.提高了分工合作能力

200

600

1.熟悉了爬虫工具的使用

2.使用c++处理数据

3.写代码过程中学会了新的算法

4.提高了分工合作能力

13.贴出Github的代码签入记录

相关阅读:
题解 P4999 【烦人的数学作业】
题解 P2657 【[SCOI2009] windy 数】
题解【洛谷 P1466 [USACO2.2]集合 Subset Sums】
乘法逆元
 浅谈二维前缀和
 浅谈位运算
 浅谈 Lucas 定理
 浅谈 exgcd
【洛谷P1754 球迷购票问题】题解
 RPA机器人工厂化时代下，艺赛旗要做什么样的“四新”产品
原文地址：https://www.cnblogs.com/YangLiLiang/p/9752105.html