福大软工1816 · 第五次作业——结对作业2

一、博客链接和Github链接

郑孔宇 俞凯欣

github项目地址

二、具体分工：

基本功能部分：郑孔宇
爬虫工具、测试及附加题部分：俞凯欣

三、PSP表格：

PSP2.1	Personal Software Process Stages	预估耗时（分钟）	实际耗时（分钟）
Planning	计划	10	10
· Estimate	· 估计这个任务需要多少时间	540	620
Development	开发	0	0
· Analysis	· 需求分析 (包括学习新技术)	120	60
· Design Spec	· 生成设计文档	20	20
· Design Review	· 设计复审	20	20
· Coding Standard	· 代码规范 (为目前的开发制定合适的规范)	0	0
· Design	· 具体设计	20	20
· Coding	· 具体编码	220	360
· Code Review	· 代码复审	40	40
· Test	· 测试（自我测试，修改代码，提交修改）	60	60
Reporting	报告	0	0
· Test Report	· 测试报告	0	0
· Size Measurement	· 计算工作量	0	0
· Postmortem & Process Improvement Plan	· 事后总结, 并提出过程改进计划	30	30

四、爬虫工具

爬虫工具使用的是八爪鱼采集器：先点击页面上需要爬取的信息，工具会自动选中同类型的其他信息；然后制作采集流程图并运行爬虫；爬取完毕后再将数据导出即可。

五、需求分析

六、功能实现

1.主函数

获取命令行中的所有指令，并执行相关函数

/*
 * Program entry point: parse the command-line options, then run the
 * counting pipeline.
 *
 * Recognised options (each must be followed by one value):
 *   -i <path>  input file, passed to readtxt / readtxt2   (required)
 *   -o <path>  output file, passed to writetxt_n / writetxt_m (required)
 *   -w <int>   forwarded to divide_n / divide_m as `w`
 *   -n <int>   forwarded to writetxt_n as `n`
 *   -m <int>   forwarded to divide_m as `m`; a non-zero value selects
 *              the phrase-counting branch
 * Any other argument is ignored.
 *
 * Returns 0 on success and 1 when an option has no value or when the
 * input / output path is missing.
 */
int main(int args, char* argv[])
{
	char* a = NULL; // input path  (-i)
	char* b = NULL; // output path (-o)
	int w = 0, m = 0, n = 0;

	// argv[0] is the program name, so option scanning starts at index 1.
	for (int i = 1; i < args; i++)
	{
		const char* opt = argv[i];
		const bool known =
			strcmp(opt, "-i") == 0 || strcmp(opt, "-o") == 0 ||
			strcmp(opt, "-w") == 0 || strcmp(opt, "-n") == 0 ||
			strcmp(opt, "-m") == 0;
		if (!known)
		{
			continue;
		}
		// argv[args] is a null pointer; without this guard a trailing
		// option would hand NULL to atoi / the file helpers.
		if (i + 1 >= args)
		{
			fprintf(stderr, "missing value for option %s\n", opt);
			return 1;
		}
		char* val = argv[++i]; // consume the value so it is not re-read as an option
		switch (opt[1])
		{
		case 'i': a = val; break;       // -i: input file
		case 'o': b = val; break;       // -o: output file
		case 'w': w = atoi(val); break; // -w: weight argument
		case 'n': n = atoi(val); break; // -n: entry count argument
		case 'm': m = atoi(val); break; // -m: phrase length argument
		}
	}

	if (a == NULL || b == NULL)
	{
		fprintf(stderr, "usage: -i <input> -o <output> [-w <int>] [-n <int>] [-m <int>]\n");
		return 1;
	}

	readtxt(a);  // read the file and obtain the total character count
	divide_n(w); // count lines and words, adjust the character count, split/store/sort the words
	if (m == 0)
	{
		writetxt_n(b, n); // output format used when -m is absent
	}
	else
	{
		readtxt2(a);     // re-read the file without changing the character count, for re-splitting
		divide_m(w, m);  // split again and store/sort the word groups that qualify as phrases
		writetxt_m(b);   // output format used when -m is present
	}
	return 0;
}

2.词频统计(divide_n) (writetxt_n)

/*
 * Split the global text buffer `s` into tokens and accumulate word
 * statistics into the file-level globals.
 *
 *  - A "Title" token subtracts 11 from `charnum`, increments `linenum`
 *    and makes `w` the weight of the words that follow.
 *  - An "Abstract" token subtracts 10 from `charnum`, increments
 *    `linenum` and resets the weight to 1.
 *    (presumably these subtractions remove the label / paper-number
 *    characters from the count -- TODO confirm against readtxt)
 *  - Every other token is lower-cased (ASCII only); it counts as a word
 *    when it has at least four characters and the first four are
 *    letters. Each word increments `wordnum` and adds the current
 *    weight to its entry in `w_c`.
 *
 * Finally every (word, count) pair of `w_c` is appended to `w_c2`, which
 * is then sorted with `Comp`.
 *
 * NOTE(review): main() passes w = 0 when -w is not given, which makes
 * title words contribute 0 -- verify that this is intended.
 *
 * w: weight applied to words that follow a "Title" token.
 */
void divide_n(int w)
{
	// Characters on which the text is split.
	// NOTE(review): the escapes (\\ , \" , \n) were restored from a listing
	// that showed them unescaped; confirm against the repository copy.
	const char *sep = "./;'[] \\<>?:\"{}|`~!@#$%^&*()_+-=\n";
	int w2 = 1;       // weight of the section currently being read
	char *buf = NULL; // strtok_s context

	for (char *p = strtok_s(s, sep, &buf); p != NULL; p = strtok_s(NULL, sep, &buf))
	{
		if (strcmp(p, "Title") == 0)
		{
			// Title section: weight becomes w.
			charnum -= 11;
			linenum++;
			w2 = w;
			continue;
		}
		if (strcmp(p, "Abstract") == 0)
		{
			// Abstract section: weight becomes 1.
			charnum -= 10;
			linenum++;
			w2 = 1;
			continue;
		}

		// Work on a std::string so that token length is not limited by a
		// fixed-size scratch array.
		string wordstring = p;
		for (size_t i = 0; i < wordstring.length(); i++)
		{
			if (wordstring[i] >= 'A' && wordstring[i] <= 'Z')
			{
				wordstring[i] = wordstring[i] + 32; // ASCII upper -> lower
			}
		}

		// A word needs at least four characters, the first four being letters.
		bool valid = wordstring.length() >= 4;
		for (size_t j = 0; valid && j < 4; j++)
		{
			valid = wordstring[j] >= 'a' && wordstring[j] <= 'z';
		}
		if (!valid)
		{
			continue;
		}

		wordnum++;
		w_c[wordstring] += w2; // a new key starts from a zero count
	}

	for (w_c_iter = w_c.begin(); w_c_iter != w_c.end(); w_c_iter++)
	{
		w_c2.push_back(make_pair(w_c_iter->first, w_c_iter->second));
	}
	sort(w_c2.begin(), w_c2.end(), Comp);
}
void writetxt_n(char *b, int n)
{
	char charnum_s[10], wordnum_s[10], linenum_s[10];
	char num_s[10];
	string res;
	char res_c[200000];
	_itoa_s(charnum + 2, charnum_s, 10);
	_itoa_s(wordnum, wordnum_s, 10);
	_itoa_s(linenum, linenum_s, 10);
	res = res + "characters: " + charnum_s + "
";
	res = res + "words: " + wordnum_s + "
";
	res = res + "lines: " + linenum_s + "
";
	if (n == 0)
	{
		n = 10;
	}
	if (w_c2.size() >= n)
	{
		for (w_c2_iter = w_c2.begin(); w_c2_iter != w_c2.begin() + n; w_c2_iter++)
		{
			_itoa_s(w_c2_iter->second, num_s, 10);
			res = res + "<" + w_c2_iter->first + ">: " + num_s + "
";
		}
	}
	else
	{
		for (w_c2_iter = w_c2.begin(); w_c2_iter != w_c2.end(); w_c2_iter++)
		{
			_itoa_s(w_c2_iter->second, num_s, 10);
			res = res + "<" + w_c2_iter->first + ">: " + num_s + "
";
		}
	}
	strcpy_s(res_c, res.c_str());
	FILE *fp1;
	errno_t err;
	err = fopen_s(&fp1, b, "w");
	fwrite(res_c, res.length(), 1, fp1);
}

3.词组统计(divide_m) (writetxt_m)

void divide_m(int w, int m)
{
	size_t length;
	int cznum = 0;
	string cz;
	string wordstring;
	char wordchar[999];
	int w2 = 1;
	int i, j, k = 0;
	int pos = 0;
	const char *sep = "./;'[] \<>?:"{}|`~!@#$%^&*()_+-=
"; //需要分割的字符
	char *p = NULL;
	char *buf;
	p = strtok_s(s, sep, &buf);
	while (p)
	{
		wordstring = p;
		strcpy_s(wordchar, wordstring.c_str());
		if (strcmp(wordchar, "Title") == 0)
		{
			w2 = w;
			while (cz_q1.empty() == 0)
			{
				cz_q1.pop();
			}
			while (cz_q2.empty() == 0)
			{
				cz_q2.pop();
			}
		} // 出现Title 权重为w；
		else if (strcmp(wordchar, "Abstract") == 0)
		{
			w2 = 1;
			while (cz_q1.empty() == 0)
			{
				cz_q1.pop();
			}
			while (cz_q2.empty() == 0)
			{
				cz_q2.pop();
			}
		}// 出现Abstract 权重为1；
		else
		{
			length = wordstring.length();
			for (i = 0; i <= length; i++)
			{
				if (wordchar[i] >= 'A' && wordchar[i] <= 'Z')
				{

					wordchar[i] = wordchar[i] + 32;
				}
			}
			wordstring = wordchar;
			if (wordstring.length() >= 4) //合法pos=1  不合法pos=0
			{
				for (j = 0; j <= 3; j++)//判断该单词是否符合前四位为字母
				{
					if (wordchar[j] >= 'a' && wordchar[j] <= 'z')
					{
						pos = 1;
					}
					else
					{
						pos = 0;
						break;
					}
				}
			}
			else
			{
				pos = 0;
			}

			if (pos == 1)
			{
				if (cz_q2.size() == 0)
				{
					cz = "";
				}
				cz_q1.push(wordstring); //将合法单词入队q1
				cz_q2.push(wordstring); //将合法单词入队q2
				if (cz_q2.size() == m)
				{
					cz_q1.pop();	//若满足条件称为词组 则q1的首个单词出队
					for (i = 1; i <= m; i++)   //q2的所有单词存入cz中 用于输出并清空q2
					{
						if (i == m)
						{
							cz = cz + cz_q2.front();
							cz_q2.pop();
						}
						else
						{
							cz = cz + cz_q2.front() + " ";
							cz_q2.pop();
						}
					}
					if (cz_c.count(cz) == 0) //查询map中是否有该词组 无则将 词组，频率 引入  有则将原有 词组的频率累加
					{
						cz_c.insert(make_pair(cz, w2));
						cz = "";
					}
					else
					{
						cz_c[cz] += w2;
						cz = "";
					}
					for (j = 1; j <= cz_q1.size(); j++)  //将q1中剩余单词存入pop[]中 同步存入q2
					{
						pop[j] = cz_q1.front();
						cz_q1.pop();
						cz_q1.push(pop[j]);
						cz_q2.push(pop[j]);
					}
				}
			}
			else if (pos == 0) //当遇到非法单词 将两个队列清空
			{
				while (cz_q1.empty() == 0)
				{
					cz_q1.pop();
				}
				while (cz_q2.empty() == 0)
				{
					cz_q2.pop();
				}
			}
		}
		p = strtok_s(NULL, sep, &buf);
	}
	for (cz_c_iter = cz_c.begin(); cz_c_iter != cz_c.end(); cz_c_iter++)
	{
		cz_c2.push_back(make_pair(cz_c_iter->first, cz_c_iter->second));
	}
	sort(cz_c2.begin(), cz_c2.end(), Comp);
}
void writetxt_m(char *b)
{
	char charnum_s[10], wordnum_s[10], linenum_s[10];
	char num_s[10];
	string res;
	char res_c[200000];
	_itoa_s(charnum + 2, charnum_s, 10);
	_itoa_s(wordnum, wordnum_s, 10);
	_itoa_s(linenum, linenum_s, 10);
	res = res + "characters: " + charnum_s + "
";
	res = res + "words: " + wordnum_s + "
";
	res = res + "lines: " + linenum_s + "
";
	if (cz_c2.size() >= 10)
	{
		for (cz_c2_iter = cz_c2.begin(); cz_c2_iter != cz_c2.begin() + 10; cz_c2_iter++)
		{
			_itoa_s(cz_c2_iter->second, num_s, 10);
			res = res + "<" + cz_c2_iter->first + ">: " + num_s + "
";
		}
	}
	else
	{
		for (cz_c2_iter = cz_c2.begin(); cz_c2_iter != cz_c2.end(); cz_c2_iter++)
		{
			_itoa_s(cz_c2_iter->second, num_s, 10);
			res = res + "<" + cz_c2_iter->first + ">: " + num_s + "
";
		}
	}
	strcpy_s(res_c, res.c_str());
	FILE *fp1;
	errno_t err;
	err = fopen_s(&fp1, b, "w");
	fwrite(res_c, res.length(), 1, fp1);
}

七、测试结果

输入
输出

八、性能分析

九、附加功能

代码

import json

from pyecharts import Bar, Line, Scatter, EffectScatter, Grid, WordCloud, Graph, Page
# Keywords and their occurrence counts, kept side by side so the two
# sequences handed to the chart cannot drift out of step.
hot_words = [
    ("learning", 2879),
    ("with", 2744),
    ("image", 2306),
    ("from", 1826),
    ("network", 1757),
    ("that", 1757),
    ("deep", 1735),
    ("networks", 1510),
    ("this", 1423),
    ("video", 1088),
    ("visual", 1030),
    ("neural", 952),
    ("detection", 938),
    ("model", 909),
    ("segmentation", 889),
    ("multi", 827),
]
name = [word for word, _ in hot_words]
value = [count for _, count in hot_words]

# Build the word-cloud chart, add the single unnamed series, and render it.
wordcloud = WordCloud("CVPR热词图谱")
wordcloud.add("", name, value, word_size_range=[20, 100])
wordcloud.render()

十、评价队友

相当棒棒

十一、学习记录

第N周	新增代码	累计代码	本周学习时间	累计学习时间(小时)	重要成长
1	200	200	5	5	对Axure的学习
5	200	400	12	17	html,css的学习
5	400	800	8	25	对c中各种函数的学习

相关阅读:
ubuntu中source insight打不开，报错pagefault的解决方法
 第六次团队作业——Alpha冲刺之事后诸葛亮
 Alpha阶段总结
 第五次团队作业——第一次项目冲刺——Alpha版本
 第四次团队作业——系统设计
 团队项目——需求规格说明书
 第二次团队作业——预则立&&他山之石
 团队项目时间规划2016
第二次结对编程作业——毕设导师智能匹配
 第二次团队作业——团队选题报告
原文地址：https://www.cnblogs.com/kkyblog/p/9766631.html

福大软工1816 · 第五次作业——结对作业2

一、博客链接和Github链接

郑孔宇 俞凯欣

github项目地址

二、具体分工：

三、PSP表格：

四、爬虫工具

爬虫工具使用的是八爪鱼采集器：先点击页面上需要爬取的信息，工具会自动选中同类型的其他信息；然后制作采集流程图并运行爬虫；爬取完毕后再将数据导出即可。

五、需求分析

六、功能实现

1.主函数

2.词频统计(divide_n) (writetxt_n)

3.词组统计(divide_m) (writetxt_m)

七、测试结果

八、性能分析

九、附加功能

十、评价队友

相当棒棒

十一、学习记录

郑孔宇俞凯欣