问题
- 数据集出现一个顶点有重复属性:词形还原后没有去重
- 异常的home和pag:查看xml发现会有作者的home page出现在title块中,page变成pag是因为我去掉了最后一个字符
#
官网下载
dblp网站:https://dblp.uni-trier.de/
数据集网站:https://dblp.org/xml/
数据集:dblp-2018-04-01.xml.gz
仔细看会发现,dblp数据集网站每天都会添加新数据,也就是说数据集是实时更新的,所以下载时不要下载实时更新的版本。查看release/文件夹可以发现,一般会在每月1-3日发布当月的稳定版本。实时更新的数据集会被覆盖,之后就再也下载不到当时那份数据了,所以推荐下载release版本,方便以后找回。我就遇到了这种问题:当初用的是实时版本,现在再也找不回那份数据集了。
由于我的版本数据集找不到xml了,决定使用师姐的数据集dblp-2018-04-01.xml.gz。
1、解压后用snap的库解析程序
1 | /************************************************************************************************* |
1 | 6160001 |
dblp_attr_original.txt 759M
dblp_ungraph.txt 504M
dblp_vertex.txt 47.3M
2、去重边去自环边
1 | #include <bits/stdc++.h> |
正如题目,处理过后效果明显,504M->141M
已完成34000000
已完成35000000
原来图边的数量 = 35205510
处理后边的数量 = 9735108
最大的顶点值 = 2081308
3、词形还原
1 | # -*- coding: utf-8 -*- |
1 | 2060000 |
检查词形还原后的结果
包含自建字典,用来修正python没有修正过来的单词,如subfigures。
#include <bits/stdc++.h>
using namespace std;
// Splits `s` on every occurrence of the delimiter `c` and appends the pieces to `v`.
//
// - `v` is appended to, never cleared, so callers can accumulate several splits.
// - Empty pieces between two adjacent delimiters (or before a leading delimiter)
//   are kept; an empty piece after a trailing delimiter is not produced.
// - An empty `s` appends nothing.
// - An empty delimiter cannot split anything: the whole (non-empty) string is
//   appended as a single piece.
void SplitString(const std::string& s, std::vector<std::string>& v, const std::string& c)
{
    // find("") matches at every position without advancing, which would make
    // the loop below run forever; handle it up front.
    if (c.empty()) {
        if (!s.empty()) v.push_back(s);
        return;
    }

    std::string::size_type begin = 0;
    for (std::string::size_type end = s.find(c); end != std::string::npos; end = s.find(c, begin)) {
        v.push_back(s.substr(begin, end - begin));
        begin = end + c.size();
    }
    // Whatever follows the last delimiter (nothing if the string ended with it).
    if (begin != s.length()) v.push_back(s.substr(begin));
}
// Diagnostic helper: for every vertex in `fileName`, writes to "out.txt" the
// attributes that end in "s", "ed" or "ing".
//
// Input format : one vertex per line, "<node id><whitespace><attr1,attr2,...>".
// Output format: "<node id>\t" followed by each matching attribute and a
//                trailing ",", then a newline.
//
// The file is read line by line. Lines that do not contain both a node id and
// an attribute list (for example a vertex without attributes) are skipped; the
// previous fscanf/feof loop never terminated on such a line.
void GetPostfix(const char* fileName)
{
    std::ifstream attributeF(fileName);
    if (!attributeF) {
        std::cerr << "GetPostfix: cannot open input file " << fileName << std::endl;
        return;
    }
    FILE* out = fopen("out.txt", "w");
    if (out == NULL) {
        std::cerr << "GetPostfix: cannot open output file out.txt" << std::endl;
        return;
    }

    // True when `word` ends with `suffix`; safe for words shorter than the suffix.
    const auto endsWith = [](const std::string& word, const char* suffix) {
        const std::size_t n = std::strlen(suffix);
        return word.size() >= n && word.compare(word.size() - n, n, suffix) == 0;
    };

    int index = 1;
    std::string line;
    while (std::getline(attributeF, line)) {
        index++;
        if (index % 10000 == 0) std::cout << index << std::endl;

        int node;
        std::string attr;
        std::istringstream fields(line);
        if (!(fields >> node >> attr)) continue;  // malformed line or no attributes

        std::vector<std::string> vec;
        SplitString(attr, vec, ",");

        fprintf(out, "%d\t", node);
        for (const std::string& word : vec) {
            if (endsWith(word, "s") || endsWith(word, "ed") || endsWith(word, "ing")) {
                fprintf(out, "%s,", word.c_str());
            }
        }
        fprintf(out, "\n");
    }
    fclose(out);
}
// Global word -> occurrence-count table. Vertex_Attribute and ComputeFrequency
// both increment and read it.
map<string, int> attrCnt;
void Vertex_Attribute(const char* fileName, const char* outFileName)
{
int index = 1;
FILE *attributeF = fopen(fileName, "r");
while (!feof(attributeF)) {
index++;
if (index % 10000 == 0) cout << index << endl;
int node;
char attr[20005];
fscanf(attributeF, "%d\t%s\n", &node, &attr);
vector<string> vec;
string str = string(attr);
if (str.size() == 0) continue;
SplitString(str, vec, ",");
for (int i = 0; i < vec.size(); i++) {
attrCnt[vec[i]]++; // 计数
}
}
index = 1;
FILE *in = fopen(fileName, "r");
FILE *out = fopen(outFileName, "w");
while (!feof(in)) {
index++;
if (index % 10000 == 0) cout << "进度=" << index << endl;
int node;
char attr[20005];
fscanf(in, "%d\t%s\n", &node, &attr);
fprintf(out, "%d\t", node);
vector<string> vec;
string str = string(attr);
if (str.size() == 0) continue;
SplitString(str, vec, ",");
set<string> ss;
for (int i = 0; i < vec.size(); i++) {
int len = vec[i].size();
string tmp = vec[i];
if (vec[i][len - 1] == 's') { // 自建字典,如果去掉s的2倍数量大于加s的单词就删除s
string s = vec[i].substr(0, len - 1);
if (attrCnt[s] * 2 > attrCnt[vec[i]]) tmp = s;
}
else if (len > 1 && vec[i][len - 1] == 'd' && vec[i][len - 2] == 'e') {
string s = vec[i].substr(0, len - 2);
if (attrCnt[s] * 2 > attrCnt[vec[i]]) tmp = s;
}
else if (len > 2 && vec[i][len - 1] == 'g' && vec[i][len - 2] == 'n' && vec[i][len - 3] == 'i') {
string s = vec[i].substr(0, len - 3);
if (attrCnt[s] * 2 > attrCnt[vec[i]]) tmp = s;
}
ss.insert(tmp);
}
bool flag = true;
for (string i : ss) {
if (flag) {
flag = false;
fprintf(out, "%s", i.data());
}
else fprintf(out, ",%s", i.data());
}
fprintf(out, "\n");
}
fclose(in);
fclose(out);
fclose(attributeF);
return;
}
// Computes attribute statistics for the vertex/attribute file `fileName` and
// writes a report to `outFileName`.
//
// Input format: one vertex per line, "<node id><whitespace><attr1,attr2,...>".
// Lines without both fields are skipped and do not count as vertices.
//
// Report layout: a header line with the number of vertices, the total number
// of attribute occurrences, the average per vertex and the number of distinct
// attributes, followed by one line per attribute (attribute, occurrences,
// occurrences / vertices) ordered by descending count, ties by descending
// attribute string. The average is also printed to stdout, and a message is
// printed for every repeated attribute found on a single vertex.
//
// The global attrCnt is cleared first: it may still hold the counts gathered by
// Vertex_Attribute on a different file, which would inflate every frequency.
// After the call attrCnt holds the counts of `fileName`.
void ComputeFrequency(const char* fileName, const char* outFileName)
{
    std::ifstream attributeF(fileName);
    if (!attributeF) {
        std::cerr << "ComputeFrequency: cannot open input file " << fileName << std::endl;
        return;
    }
    attrCnt.clear();

    int sum = 0;    // total number of attribute occurrences
    int index = 0;  // number of vertices read so far
    std::string line;
    while (std::getline(attributeF, line)) {
        int node;
        std::string attr;
        std::istringstream fields(line);
        if (!(fields >> node >> attr)) continue;  // malformed line or no attributes

        index++;
        if (index % 10000 == 0) std::cout << index << std::endl;

        std::vector<std::string> vec;
        SplitString(attr, vec, ",");
        sum += static_cast<int>(vec.size());

        // Sanity check: a single vertex should not list the same attribute twice.
        std::set<std::string> seen;
        for (const std::string& word : vec) {
            attrCnt[word]++;  // count
            if (!seen.insert(word).second) {
                std::cout << "顶点出现重复属性:" << index << std::endl;
            }
        }
    }

    // Rank by (count, attribute) in descending order.
    std::vector<std::pair<int, std::string> > ranked;
    ranked.reserve(attrCnt.size());
    for (std::map<std::string, int>::const_iterator it = attrCnt.begin(); it != attrCnt.end(); ++it) {
        ranked.push_back(std::make_pair(it->second, it->first));
    }
    std::sort(ranked.begin(), ranked.end(), std::greater<std::pair<int, std::string> >());
    const int cnt = static_cast<int>(ranked.size());

    FILE* out = fopen(outFileName, "w");
    if (out == NULL) {
        std::cerr << "ComputeFrequency: cannot open output file " << outFileName << std::endl;
        return;
    }

    // Guard the empty-input case instead of dividing by zero.
    double tmp = index > 0 ? sum * 1.0 / index : 0.0;
    std::cout << tmp << std::endl;
    fprintf(out, "顶点数量=%d\t属性总数量=%d\t平均数量=%.3f\t属性数量=%d\n", index, sum, tmp, cnt);
    fprintf(out, "\n属性\t出现次数\t占比例\n");
    for (std::size_t i = 0; i < ranked.size(); ++i) {
        tmp = ranked[i].first * 1.0 / index;  // index > 0 whenever ranked is non-empty
        fprintf(out, "%s\t%d\t%.3f\n", ranked[i].second.c_str(), ranked[i].first, tmp);
    }
    fclose(out);
}
// Replaces attribute strings by integer ids.
//
// Pass 1 assigns ids 1, 2, 3, ... to attributes in order of first appearance in
// `fileName` and writes the dictionary to `outFileName1` as "<attribute>\t<id>\n".
// Pass 2 rewrites every vertex to `outFileName2` as "<node id>\t<id1,id2,...>\n".
// Ids start at 1 so that 0 can stand for "no attribute".
//
// Input format: one vertex per line, "<node id><whitespace><attr1,attr2,...>".
// Lines without both fields (e.g. a vertex with no attributes) are skipped; the
// previous fscanf/feof loop never terminated on such a line.
void Transform2Int(const char* fileName, const char* outFileName1, const char* outFileName2)
{
    // ---- Pass 1: build and write the string -> id dictionary ----
    std::ifstream attributeF(fileName);
    if (!attributeF) {
        std::cerr << "Transform2Int: cannot open input file " << fileName << std::endl;
        return;
    }
    FILE* out1 = fopen(outFileName1, "w");
    if (out1 == NULL) {
        std::cerr << "Transform2Int: cannot open output file " << outFileName1 << std::endl;
        return;
    }

    std::map<std::string, int> attribute_index;
    int attr_index = 1;  // start at 1; 0 is reserved for "none"
    int index = 1;
    std::string line;
    while (std::getline(attributeF, line)) {
        index++;
        if (index % 100000 == 0) std::cout << index << std::endl;

        int node;
        std::string attr;
        std::istringstream fields(line);
        if (!(fields >> node >> attr)) continue;  // malformed line or no attributes

        std::vector<std::string> vec;
        SplitString(attr, vec, ",");
        for (const std::string& word : vec) {
            // insert() only succeeds the first time a word is seen.
            if (attribute_index.insert(std::make_pair(word, attr_index)).second) {
                fprintf(out1, "%s\t%d\n", word.c_str(), attr_index);
                attr_index++;
            }
        }
    }
    attributeF.close();
    fclose(out1);

    // ---- Pass 2: rewrite every vertex with integer ids ----
    std::ifstream in(fileName);
    FILE* out2 = fopen(outFileName2, "w");
    if (!in || out2 == NULL) {
        std::cerr << "Transform2Int: cannot open " << fileName << " / " << outFileName2 << std::endl;
        if (out2 != NULL) fclose(out2);
        return;
    }

    index = 1;
    while (std::getline(in, line)) {
        index++;
        if (index % 10000 == 0) std::cout << "进度=" << index << std::endl;

        int node;
        std::string attr;
        std::istringstream fields(line);
        if (!(fields >> node >> attr)) continue;  // malformed line or no attributes

        std::vector<std::string> vec;
        SplitString(attr, vec, ",");

        fprintf(out2, "%d\t", node);
        const char* separator = "";
        for (const std::string& word : vec) {
            // Every word was registered in pass 1; fall back to 0 ("none") if
            // the file changed between the two passes.
            const std::map<std::string, int>::const_iterator it = attribute_index.find(word);
            const int id = (it == attribute_index.end()) ? 0 : it->second;
            fprintf(out2, "%s%d", separator, id);
            separator = ",";
        }
        fprintf(out2, "\n");
    }
    fclose(out2);
}
// Shorthand for a (string, int) pair.
using psi = std::pair<std::string, int>;

// Streams a psi as its string, a tab, then its integer; returns the stream so
// insertions can be chained.
std::ostream& operator<<(std::ostream& out, const psi& p) {
    out << p.first;
    out << "\t";
    out << p.second;
    return out;
}
bool cmp(psi a, psi b)
{
return a.second > b.second;
}
// Copies `inFileName` to `outFileName`, dropping every line that has no
// attribute field, i.e. nothing but whitespace after the leading vertex id
// (blank lines are dropped too). Kept lines are copied byte for byte.
//
// Progress (every 10000 lines) and the 1-based number of each dropped line are
// printed to stdout.
//
// Fixes over the earlier version:
// - the loop is driven by fgets() succeeding instead of feof(), which used to
//   process the last line twice;
// - a line is judged by its content rather than by "length > 9", which was only
//   right for 7-digit vertex ids followed by a separator and a newline;
// - lines longer than the buffer are treated as one line instead of being
//   judged piece by piece.
void Debug(const char* inFileName, const char* outFileName)
{
    FILE* in = fopen(inFileName, "r");
    if (in == NULL) {
        std::cerr << "Debug: cannot open input file " << inFileName << std::endl;
        return;
    }
    FILE* out = fopen(outFileName, "w");
    if (out == NULL) {
        std::cerr << "Debug: cannot open output file " << outFileName << std::endl;
        fclose(in);
        return;
    }

    // True when `text` holds a second whitespace-separated field.
    const auto hasAttributes = [](const char* text) {
        const unsigned char* p = reinterpret_cast<const unsigned char*>(text);
        while (*p != '\0' && std::isspace(*p)) ++p;   // leading whitespace
        while (*p != '\0' && !std::isspace(*p)) ++p;  // vertex id
        while (*p != '\0' && std::isspace(*p)) ++p;   // separator
        return *p != '\0';
    };

    int index = 0;
    char chunk[20005];
    bool atLineStart = true;  // the next chunk begins a new line
    bool keepLine = false;    // decision taken at the start of the current line
    while (fgets(chunk, sizeof(chunk), in) != NULL) {
        if (atLineStart) {
            index++;
            if (index % 10000 == 0) std::cout << index << std::endl;
            keepLine = hasAttributes(chunk);
            if (!keepLine) std::cout << index << std::endl;
        }
        if (keepLine) fputs(chunk, out);

        // A chunk that does not end in '\n' is the beginning of an over-long
        // line; the following chunk continues it.
        const std::size_t len = std::strlen(chunk);
        atLineStart = len > 0 && chunk[len - 1] == '\n';
    }
    fclose(in);
    fclose(out);
}
// Pipeline driver: dictionary-based suffix clean-up, frequency report, then
// conversion of attribute strings to integer ids.
int main()
{
    // Optional preprocessing step (disabled): remove vertices without attributes.
    //const char* fileName1 = "dblp_nltk_attributes.txt";
    //const char* fileName2 = "out.txt";
    //Debug(fileName1, fileName2);
    //return 0;

    const char* const lemmatizedInput = "dblp_nltk_attributes.txt";
    const char* const cleanedAttributes = "dblp_attributes.txt";
    const char* const frequencyReport = "frequency.txt";
    const char* const stringToIntTable = "dblp_String2Int.txt";
    const char* const intAttributes = "dblp_attributes_int.txt";

    // Check s / ed / ing endings against the self-built dictionary
    // (note 2019-06-23: duplicates must be removed).
    Vertex_Attribute(lemmatizedInput, cleanedAttributes);

    // Compute how frequent each vertex attribute is.
    ComputeFrequency(cleanedAttributes, frequencyReport);

    // Convert attribute strings to integers.
    Transform2Int(cleanedAttributes, stringToIntTable, intAttributes);

    return 0;
}
出现文件读不完的情况???居然是有个顶点一个属性都没有造成的。
使用fgets和fputs逐行处理,把字符长度不超过9的行(即没有属性的顶点)删除。
// Copies `inFileName` to `outFileName`, dropping every line that has no
// attribute field, i.e. nothing but whitespace after the leading vertex id
// (blank lines are dropped too). Kept lines are copied byte for byte.
//
// Progress (every 10000 lines) and the 1-based number of each dropped line are
// printed to stdout.
//
// Fixes over the earlier version:
// - the loop is driven by fgets() succeeding instead of feof(), which used to
//   process the last line twice;
// - a line is judged by its content rather than by "length > 9", which was only
//   right for 7-digit vertex ids followed by a separator and a newline;
// - lines longer than the buffer are treated as one line instead of being
//   judged piece by piece.
void Debug(const char* inFileName, const char* outFileName)
{
    FILE* in = fopen(inFileName, "r");
    if (in == NULL) {
        std::cerr << "Debug: cannot open input file " << inFileName << std::endl;
        return;
    }
    FILE* out = fopen(outFileName, "w");
    if (out == NULL) {
        std::cerr << "Debug: cannot open output file " << outFileName << std::endl;
        fclose(in);
        return;
    }

    // True when `text` holds a second whitespace-separated field.
    const auto hasAttributes = [](const char* text) {
        const unsigned char* p = reinterpret_cast<const unsigned char*>(text);
        while (*p != '\0' && std::isspace(*p)) ++p;   // leading whitespace
        while (*p != '\0' && !std::isspace(*p)) ++p;  // vertex id
        while (*p != '\0' && std::isspace(*p)) ++p;   // separator
        return *p != '\0';
    };

    int index = 0;
    char chunk[20005];
    bool atLineStart = true;  // the next chunk begins a new line
    bool keepLine = false;    // decision taken at the start of the current line
    while (fgets(chunk, sizeof(chunk), in) != NULL) {
        if (atLineStart) {
            index++;
            if (index % 10000 == 0) std::cout << index << std::endl;
            keepLine = hasAttributes(chunk);
            if (!keepLine) std::cout << index << std::endl;
        }
        if (keepLine) fputs(chunk, out);

        // A chunk that does not end in '\n' is the beginning of an over-long
        // line; the following chunk continues it.
        const std::size_t len = std::strlen(chunk);
        atLineStart = len > 0 && chunk[len - 1] == '\n';
    }
    fclose(in);
    fclose(out);
}
1 | 进度=2070000 |
发现一个顶点会有重复的属性,这种情况是在用自建字典做词形还原时出现的。但目前处理得还不够完美:不应该在python词形还原时就取前20个属性,而应该在这里再取前20。时间紧迫,python处理一遍大概需要半小时。
运行,建立超图和属性关键字索引
1 | [hejian@sklse ETAttriOnLinux]$ ./main_by_hejian |
#
难受,以前因为没有后面的两种方法,所以选取设计的ATCImprove方法进行比较,想改却没有时间了,就这样吧。