• 福大软工1816 · 第五次作业——结对作业2


    一、博客链接和Github链接

    郑孔宇 俞凯欣

    github项目地址

    二、具体分工:

    基本功能部分:郑孔宇
    爬虫工具、测试及附加题部分:俞凯欣

    三、PSP表格:

    PSP2.1 Personal Software Process Stages 预估耗时(分钟) 实际耗时(分钟)
    Planning 计划 10 10
    · Estimate · 估计这个任务需要多少时间 540 620
    Development 开发 0 0
    · Analysis · 需求分析 (包括学习新技术) 120 60
    · Design Spec · 生成设计文档 20 20
    · Design Review · 设计复审 20 20
    · Coding Standard · 代码规范 (为目前的开发制定合适的规范) 0 0
    · Design · 具体设计 20 20
    · Coding · 具体编码 220 360
    · Code Review · 代码复审 40 40
    · Test · 测试(自我测试,修改代码,提交修改) 60 60
    Reporting 报告 0 0
    · Test Report · 测试报告 0 0
    · Size Measurement · 计算工作量 0 0
    · Postmortem & Process Improvement Plan · 事后总结, 并提出过程改进计划 30 30

    四、爬虫工具

    爬虫工具使用的是八爪鱼:先点击需要爬取的信息,选定同一类型的全部信息,然后制作流程图完成爬取,爬取完毕后再进行导出操作即可。


    五、需求分析

    六、功能实现

    1.主函数
    • 获取命令行中的所有指令,并执行相关函数
    /*
     * Program entry point: parses the command-line options and runs the
     * statistics pipeline.
     *
     * Recognized options (each must be followed by a value):
     *   -i <path>  input file, handed to readtxt() / readtxt2()
     *   -o <path>  output file, handed to writetxt_n() / writetxt_m()
     *   -w <int>   weight argument forwarded to divide_n() / divide_m()
     *   -n <int>   entry count forwarded to writetxt_n()
     *   -m <int>   phrase length; a non-zero value selects phrase mode
     *
     * Returns 0 on success, or 1 when an option has no value after it or
     * when -i / -o was not supplied.
     */
    int main(int args, char* argv[])
    {
    	char* inputPath = NULL;
    	char* outputPath = NULL;
    	int w = 0, m = 0, n = 0;

    	// argv[0] is the program name, so option scanning starts at index 1.
    	for (int i = 1; i < args; i++)
    	{
    		const bool isInput = strcmp(argv[i], "-i") == 0;
    		const bool isOutput = strcmp(argv[i], "-o") == 0;
    		const bool isWeight = strcmp(argv[i], "-w") == 0;
    		const bool isCount = strcmp(argv[i], "-n") == 0;
    		const bool isPhrase = strcmp(argv[i], "-m") == 0;
    		if (!(isInput || isOutput || isWeight || isCount || isPhrase))
    		{
    			continue;
    		}

    		// A trailing option has no value: argv[argc] is NULL, and building a
    		// std::string / calling atoi on NULL is undefined behavior.
    		if (i + 1 >= args)
    		{
    			fprintf(stderr, "missing value after option %s\n", argv[i]);
    			return 1;
    		}

    		char* value = argv[i + 1];
    		if (isInput)
    		{
    			inputPath = value;
    		}
    		else if (isOutput)
    		{
    			outputPath = value;
    		}
    		else if (isWeight)
    		{
    			w = atoi(value);
    		}
    		else if (isCount)
    		{
    			n = atoi(value);
    		}
    		else
    		{
    			m = atoi(value);
    		}
    	}

    	// Both paths are required; without them the reader/writer would be
    	// handed a NULL file name.
    	if (inputPath == NULL || outputPath == NULL)
    	{
    		fprintf(stderr, "usage: -i <input> -o <output> [-w <weight>] [-n <count>] [-m <phrase length>]\n");
    		return 1;
    	}

    	readtxt(inputPath); // read the file and obtain the total character count
    	divide_n(w); // count lines/words, adjust the character count for the Title:/Abstract: markers and numbering, then split, store and sort the words
    	if (m == 0)
    	{
    		writetxt_n(outputPath, n); // output format used when -m is absent
    	}
    	else
    	{
    		readtxt2(inputPath); // re-read the file without changing the character count, so it can be split again
    		divide_m(w, m); // split into words; sequences that qualify as phrases are stored and sorted
    		writetxt_m(outputPath); // output format used when -m is present
    	}
    	return 0;
    }
    
    2.词频统计(divide_n) (writetxt_n)

    void divide_n(int w)
    {
    	size_t length;
    	string wordstring;
    	char wordchar[999];
    	int w2 = 1;
    	int i, j, k = 0;
    	int pos = 0;
    	const char *sep = "./;'[] \<>?:"{}|`~!@#$%^&*()_+-=
    "; //需要分割的字符
    	char *p;
    	char *buf;
    	p = strtok_s(s, sep, &buf);
    	while (p)
    	{
    		wordstring = p;
    		strcpy_s(wordchar, wordstring.c_str());
    		if (strcmp(wordchar, "Title") == 0)
    		{
    			charnum -= 11;
    			linenum++;
    			w2 = w;
    		} // 出现Title 权重为w;
    		else if (strcmp(wordchar, "Abstract") == 0)
    		{
    			charnum -= 10;
    			linenum++;
    			w2 = 1;
    		}// 出现Abstract 权重为1;
    		else
    		{
    			length = wordstring.length();
    			for (i = 0; i <= length; i++)
    			{
    				if (wordchar[i] >= 'A' && wordchar[i] <= 'Z')
    				{
    					wordchar[i] = wordchar[i] + 32;
    				}
    			}
    			wordstring = wordchar;
    			if (wordstring.length() >= 4)
    			{
    				for (j = 0; j <= 3; j++)//判断该单词是否符合前四位为字母
    				{
    					if (wordchar[j] >= 'a' && wordchar[j] <= 'z')
    						pos = 1;
    					else
    					{
    						pos = 0;
    						break;
    					}
    				}
    			}
    			if (pos == 1)
    			{
    				wordnum++;
    				if (w_c.count(wordstring) == 0)
    				{
    					w_c.insert(make_pair(wordstring, w2));
    				}
    				else
    				{
    					w_c[wordstring] += w2;
    				}
    				pos = 0;
    			}
    		}
    		p = strtok_s(NULL, sep, &buf);
    	}
    	for (w_c_iter = w_c.begin(); w_c_iter != w_c.end(); w_c_iter++)
    	{
    		w_c2.push_back(make_pair(w_c_iter->first, w_c_iter->second));
    	}
    	sort(w_c2.begin(), w_c2.end(), Comp);
    }
    void writetxt_n(char *b, int n)
    {
    	char charnum_s[10], wordnum_s[10], linenum_s[10];
    	char num_s[10];
    	string res;
    	char res_c[200000];
    	_itoa_s(charnum + 2, charnum_s, 10);
    	_itoa_s(wordnum, wordnum_s, 10);
    	_itoa_s(linenum, linenum_s, 10);
    	res = res + "characters: " + charnum_s + "
    ";
    	res = res + "words: " + wordnum_s + "
    ";
    	res = res + "lines: " + linenum_s + "
    ";
    	if (n == 0)
    	{
    		n = 10;
    	}
    	if (w_c2.size() >= n)
    	{
    		for (w_c2_iter = w_c2.begin(); w_c2_iter != w_c2.begin() + n; w_c2_iter++)
    		{
    			_itoa_s(w_c2_iter->second, num_s, 10);
    			res = res + "<" + w_c2_iter->first + ">: " + num_s + "
    ";
    		}
    	}
    	else
    	{
    		for (w_c2_iter = w_c2.begin(); w_c2_iter != w_c2.end(); w_c2_iter++)
    		{
    			_itoa_s(w_c2_iter->second, num_s, 10);
    			res = res + "<" + w_c2_iter->first + ">: " + num_s + "
    ";
    		}
    	}
    	strcpy_s(res_c, res.c_str());
    	FILE *fp1;
    	errno_t err;
    	err = fopen_s(&fp1, b, "w");
    	fwrite(res_c, res.length(), 1, fp1);
    }
    
    3.词组统计(divide_m) (writetxt_m)

    void divide_m(int w, int m)
    {
    	size_t length;
    	int cznum = 0;
    	string cz;
    	string wordstring;
    	char wordchar[999];
    	int w2 = 1;
    	int i, j, k = 0;
    	int pos = 0;
    	const char *sep = "./;'[] \<>?:"{}|`~!@#$%^&*()_+-=
    "; //需要分割的字符
    	char *p = NULL;
    	char *buf;
    	p = strtok_s(s, sep, &buf);
    	while (p)
    	{
    		wordstring = p;
    		strcpy_s(wordchar, wordstring.c_str());
    		if (strcmp(wordchar, "Title") == 0)
    		{
    			w2 = w;
    			while (cz_q1.empty() == 0)
    			{
    				cz_q1.pop();
    			}
    			while (cz_q2.empty() == 0)
    			{
    				cz_q2.pop();
    			}
    		} // 出现Title 权重为w;
    		else if (strcmp(wordchar, "Abstract") == 0)
    		{
    			w2 = 1;
    			while (cz_q1.empty() == 0)
    			{
    				cz_q1.pop();
    			}
    			while (cz_q2.empty() == 0)
    			{
    				cz_q2.pop();
    			}
    		}// 出现Abstract 权重为1;
    		else
    		{
    			length = wordstring.length();
    			for (i = 0; i <= length; i++)
    			{
    				if (wordchar[i] >= 'A' && wordchar[i] <= 'Z')
    				{
    
    					wordchar[i] = wordchar[i] + 32;
    				}
    			}
    			wordstring = wordchar;
    			if (wordstring.length() >= 4) //合法pos=1  不合法pos=0
    			{
    				for (j = 0; j <= 3; j++)//判断该单词是否符合前四位为字母
    				{
    					if (wordchar[j] >= 'a' && wordchar[j] <= 'z')
    					{
    						pos = 1;
    					}
    					else
    					{
    						pos = 0;
    						break;
    					}
    				}
    			}
    			else
    			{
    				pos = 0;
    			}
    
    			if (pos == 1)
    			{
    				if (cz_q2.size() == 0)
    				{
    					cz = "";
    				}
    				cz_q1.push(wordstring); //将合法单词入队q1
    				cz_q2.push(wordstring); //将合法单词入队q2
    				if (cz_q2.size() == m)
    				{
    					cz_q1.pop();	//若满足条件称为词组 则q1的首个单词出队
    					for (i = 1; i <= m; i++)   //q2的所有单词存入cz中 用于输出并清空q2
    					{
    						if (i == m)
    						{
    							cz = cz + cz_q2.front();
    							cz_q2.pop();
    						}
    						else
    						{
    							cz = cz + cz_q2.front() + " ";
    							cz_q2.pop();
    						}
    					}
    					if (cz_c.count(cz) == 0) //查询map中是否有该词组 无则将 词组,频率 引入  有则将原有 词组的频率累加
    					{
    						cz_c.insert(make_pair(cz, w2));
    						cz = "";
    					}
    					else
    					{
    						cz_c[cz] += w2;
    						cz = "";
    					}
    					for (j = 1; j <= cz_q1.size(); j++)  //将q1中剩余单词存入pop[]中 同步存入q2
    					{
    						pop[j] = cz_q1.front();
    						cz_q1.pop();
    						cz_q1.push(pop[j]);
    						cz_q2.push(pop[j]);
    					}
    				}
    			}
    			else if (pos == 0) //当遇到非法单词 将两个队列清空
    			{
    				while (cz_q1.empty() == 0)
    				{
    					cz_q1.pop();
    				}
    				while (cz_q2.empty() == 0)
    				{
    					cz_q2.pop();
    				}
    			}
    		}
    		p = strtok_s(NULL, sep, &buf);
    	}
    	for (cz_c_iter = cz_c.begin(); cz_c_iter != cz_c.end(); cz_c_iter++)
    	{
    		cz_c2.push_back(make_pair(cz_c_iter->first, cz_c_iter->second));
    	}
    	sort(cz_c2.begin(), cz_c2.end(), Comp);
    }
    void writetxt_m(char *b)
    {
    	char charnum_s[10], wordnum_s[10], linenum_s[10];
    	char num_s[10];
    	string res;
    	char res_c[200000];
    	_itoa_s(charnum + 2, charnum_s, 10);
    	_itoa_s(wordnum, wordnum_s, 10);
    	_itoa_s(linenum, linenum_s, 10);
    	res = res + "characters: " + charnum_s + "
    ";
    	res = res + "words: " + wordnum_s + "
    ";
    	res = res + "lines: " + linenum_s + "
    ";
    	if (cz_c2.size() >= 10)
    	{
    		for (cz_c2_iter = cz_c2.begin(); cz_c2_iter != cz_c2.begin() + 10; cz_c2_iter++)
    		{
    			_itoa_s(cz_c2_iter->second, num_s, 10);
    			res = res + "<" + cz_c2_iter->first + ">: " + num_s + "
    ";
    		}
    	}
    	else
    	{
    		for (cz_c2_iter = cz_c2.begin(); cz_c2_iter != cz_c2.end(); cz_c2_iter++)
    		{
    			_itoa_s(cz_c2_iter->second, num_s, 10);
    			res = res + "<" + cz_c2_iter->first + ">: " + num_s + "
    ";
    		}
    	}
    	strcpy_s(res_c, res.c_str());
    	FILE *fp1;
    	errno_t err;
    	err = fopen_s(&fp1, b, "w");
    	fwrite(res_c, res.length(), 1, fp1);
    }
    

    七、测试结果

    • 输入
    • 输出

    八、性能分析

    九、附加功能

    • 代码
    import json

    from pyecharts import Bar, Line, Scatter, EffectScatter, Grid, WordCloud, Graph, Page

    # Each entry pairs a word with its count; keeping them together makes it
    # impossible for the two parallel lists below to drift out of step.
    hot_words = [
        ("learning", 2879),
        ("with", 2744),
        ("image", 2306),
        ("from", 1826),
        ("network", 1757),
        ("that", 1757),
        ("deep", 1735),
        ("networks", 1510),
        ("this", 1423),
        ("video", 1088),
        ("visual", 1030),
        ("neural", 952),
        ("detection", 938),
        ("model", 909),
        ("segmentation", 889),
        ("multi", 827),
    ]
    # WordCloud.add is called with separate word and count lists.
    name = [word for word, _ in hot_words]
    value = [count for _, count in hot_words]

    wordcloud = WordCloud("CVPR热词图谱")
    wordcloud.add("", name, value, word_size_range=[20, 100])
    # render() is called without arguments, so the output location is
    # whatever pyecharts uses by default.
    wordcloud.render()
    

    十、评价队友

    相当棒棒

    十一、学习记录

    第N周 新增代码 累计代码 本周学习时间 累计学习时间(小时) 重要成长
    1 200 200 5 5 对Axure的学习
    5 200 400 12 17 html,css的学习
    5 400 800 8 25 对c中各种函数的学习
  • 相关阅读:
    ubuntu中source insight打不开,报错pagefault的解决方法
    第六次团队作业——Alpha冲刺之事后诸葛亮
    Alpha阶段总结
    第五次团队作业——第一次项目冲刺——Alpha版本
    第四次团队作业——系统设计
    团队项目——需求规格说明书
    第二次团队作业——预则立&&他山之石
    团队项目时间规划2016
    第二次结对编程作业——毕设导师智能匹配
    第二次团队作业——团队选题报告
  • 原文地址:https://www.cnblogs.com/kkyblog/p/9766631.html
Copyright © 2020-2023  润新知