/*************************************************************************************************
 Function name : SplitStr
 Description   : Split a string into words on the given delimiter
 Input params  :
 Output params :
 Return value  :
 Calls         :
 Called by     :
 History       :
   1. Date   : 2017-11-11
      Author : 何健
      Change : created the file
   2. Date   : 2019-01-07
      Author : 何健
      Change : handle words that were previously ignored when joined by "-" and similar separators
*************************************************************************************************/
inline void SplitStr(string& s, vector<string>& v, const string& c) {
    string title;
    transform(s.begin(), s.end(), back_inserter(title), ::tolower);  // convert everything to lowercase
    s = "";
    for (int i = 0; i < title.size(); i++) {
        if (title[i] >= 'a' && title[i] <= 'z') {
            s += title[i];
        }
        else s += c;  // replace every non-letter character with the delimiter
    }
    string::size_type pos1, pos2;
    pos2 = s.find(c);
    pos1 = 0;
    while (string::npos != pos2) {
        string str = s.substr(pos1, pos2 - pos1);
        string word = "";
        for (int i = 0; i < str.size(); i++) {  // drop any remaining non-letter characters
            if (str[i] >= 'a' && str[i] <= 'z') word += str[i];
        }
        if (word.size() != 0) v.push_back(word);  // skip empty tokens
        pos1 = pos2 + c.size();
        pos2 = s.find(c, pos1);
    }
    if (pos1 != s.length()) {  // handle the tail after the last delimiter
        string str = s.substr(pos1);
        string word = "";
        for (int i = 0; i < str.size(); i++) {
            if (str[i] >= 'a' && str[i] <= 'z') word += str[i];
        }
        if (word.size() != 0) v.push_back(word);
    }
    return;
}
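// Usage sketch (illustrative, not part of the original pipeline): shows how SplitStr lowercases
// a title, maps every non-letter character to the delimiter, and collects the remaining words.
// The sample title string below is made up for illustration.
inline void SplitStrExample() {
    string title = "Community Detection in Large-Scale Networks.";
    vector<string> words;
    SplitStr(title, words, " ");
    for (const string& w : words) cout << w << endl;  // community detection in large scale networks
}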
extern map<string, int> keyword_in;  // stopword flags (1 = stopword), filled by InvalidKeyword()
// Marks every English stopword in keyword_in so that Check() can filter them out.
inline void InvalidKeyword() {
    string stopwords[] = { "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're",
        "you've", "you'll", "you'd", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
        "she", "she's", "her", "hers", "herself", "it", "it's", "its", "itself", "they", "them", "their",
        "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "that'll", "these",
        "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
        "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
        "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all",
        "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own",
        "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should", "should've",
        "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn",
        "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't",
        "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't",
        "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't", "without" };
    int len = sizeof(stopwords) / sizeof(string);
    for (int i = 0; i < len; i++) {
        keyword_in[stopwords[i]] = 1;
    }
    return;
}
// Returns true if the keyword should be kept, i.e. it is not a stopword.
inline bool Check(string keyword) {
    if (keyword_in[keyword] == 1) return false;
    return true;
}
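// Usage sketch (illustrative, not part of the original pipeline): InvalidKeyword() fills
// keyword_in once, after which Check() reports whether a token should be kept. This assumes
// keyword_in is defined in some implementation file, as the extern declaration above implies.
inline void StopwordFilterExample() {
    InvalidKeyword();                    // load the stopword table once
    cout << Check("the") << endl;        // 0: "the" is a stopword and is dropped
    cout << Check("community") << endl;  // 1: "community" is kept as an attribute keyword
}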
inline void DblpGenerate(char* file_in, char* ungraph_out, char* vertex_out, char* original_out) {
    /*
    PWgtNet Net = TWgtNet::New();
    TWgtNet DBL;
    const TStr db = "./DataSets/dblp.xml";
    DBL.LoadDblpCoAuth("ds");
    DBL.LoadDblpCoAuth(db);
    TDblpLoader *dblp = new TDblpLoader(TStr("./DataSets/dblp.xml"));
    */
    cout << "Start processing the DBLP dataset....." << endl;
    TDblpLoader dblp(file_in);
    dblp.GetFPosStr();
    int authorCount = 1;
    map<string, int> author_vertex;                  // author name -> vertex id; ids start at 1 so the map's default 0 means "not seen yet"
    map<string, vector<string> > author_attributes;  // author name -> every keyword attribute collected for that author
    int cnt = 0;
    InvalidKeyword();
    FILE* fout_ungraph = fopen(ungraph_out, "w");
    vector<string> keywords;
    vector<int> nodes;
    PUNGraph graph = PUNGraph::New();  // the graph was added so edges could be deduplicated (dedup is commented out below)
    while (dblp.Next()) {
        if (cnt++ % 10000 == 0) cout << cnt << endl;
        //cout << dblp.Year << endl;
        string titleName = dblp.Title.CStr();
        // Changed 2019-06-19: use the trailing period to drop "home page" records, which are not papers
        if (titleName.empty() || titleName[titleName.size() - 1] != '.') continue;
        titleName.erase(titleName.size() - 1);  // strip the trailing period
        //cout << titleName << endl;
        keywords.clear();
        SplitStr(titleName, keywords, " ");
        nodes.clear();
        for (int i = 0; i < dblp.AuthorV.Len(); i++) {  // all authors of the current paper
            string authorName = dblp.AuthorV[i].CStr();
            //cout << authorName << endl;
            if (author_vertex[authorName] == 0) {  // first occurrence of this author: assign a new vertex id
                graph->AddNode(authorCount);
                author_vertex[authorName] = authorCount++;
            }
            nodes.push_back(author_vertex[authorName]);
            for (string keyword : keywords) {
                if (keyword != " " && keyword != "") {
                    if (Check(keyword)) {
                        //cout << keyword << endl;
                        author_attributes[authorName].push_back(keyword);
                    }
                }
            }
        }
        if (nodes.size() > 1) {
            for (int i : nodes) {
                for (int j : nodes) {
                    if (i != j) {  // skip self-loops
                        //if (!graph->IsEdge(i, j)) {  // deduplicate edges
                        fprintf(fout_ungraph, "%d\t%d\n", i, j);
                        //    graph->AddEdge(i, j);
                        //}
                    }
                }
            }
        }
    }
    graph.Clr();
    cout << "Number of authors = " << author_vertex.size() << endl;
    cout << author_attributes.size() << endl;
    cout << "dblp_author_ungraph.txt over!" << endl;
    FILE* fout = fopen(vertex_out, "w");
    map<string, int>::iterator it = author_vertex.begin();
    for (; it != author_vertex.end(); it++) {
        fprintf(fout, "%s", it->first.data());
        fprintf(fout, "\t");
        fprintf(fout, "%d\n", it->second);
    }
    cout << "dblp_author_vertex.txt over!" << endl;
    /*
    map<string, int> topAttributes;
    map<string, vector<string> > author_attrs;  // dataset restricted to the top-20 attributes per author
    for (map<string, vector<string> >::iterator it = author_attributes.begin(); it != author_attributes.end(); it++) {
        //if (it->second.size() <= 20) {  // may contain duplicates
        //    author_attrs[it->first].assign(it->second.begin(), it->second.end());
        //    continue;
        //}
        topAttributes.clear();
        for (string attr : it->second) {
            topAttributes[attr]++;
        }
        // copy the map entries into a vector
        vector<PAIR> topAttributes_vec(topAttributes.begin(), topAttributes.end());
        // sort the vector by frequency
        sort(topAttributes_vec.begin(), topAttributes_vec.end(), CmpByValue());
        for (int i = 0; i < topAttributes_vec.size() && i < 20; i++) {  // keep only the top 20; without lemmatization this is of limited value
            author_attrs[it->first].push_back(topAttributes_vec[i].first);
        }
        topAttributes_vec.clear();
    }
    */
    // Revision of the commented block above: lemmatization is done first and frequency ranking afterwards,
    // so every attribute is written out here, which makes the file large.
    FILE* fout_attrs = fopen(original_out, "w");
    //map<string, vector<string> >::iterator itor = author_attrs.begin();
    map<string, vector<string> >::iterator itor = author_attributes.begin();
    for (; itor != author_attributes.end(); itor++) {
        string authorName = itor->first;
        fprintf(fout_attrs, "%d", author_vertex[authorName]);  // %-20s would left-align the string in a 20-character field
        fprintf(fout_attrs, "\t");
        bool flag = false;
        for (string str : itor->second) {
            //if (str == "" || str == " ") continue;
            if (!flag) {
                flag = true;
                fprintf(fout_attrs, "%s", str.data());
            }
            else fprintf(fout_attrs, ",%s", str.data());
        }
        fprintf(fout_attrs, "\n");
    }
    cout << "dblp_author_attr_original.txt over!" << endl;
    fclose(fout);
    fclose(fout_ungraph);
    fclose(fout_attrs);
    return;
}
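// Invocation sketch (illustrative; the paths are assumptions based on the names printed above):
// DblpGenerate reads the DBLP XML dump and writes three files -- the co-author edge list, the
// author-to-vertex-id map, and the raw per-author keyword attributes.
inline void DblpGenerateExample() {
    char file_in[]      = "./DataSets/dblp.xml";
    char ungraph_out[]  = "./DataSets/dblp_author_ungraph.txt";
    char vertex_out[]   = "./DataSets/dblp_author_vertex.txt";
    char original_out[] = "./DataSets/dblp_author_attr_original.txt";
    DblpGenerate(file_in, ungraph_out, vertex_out, original_out);
}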