“Document classification test”版本间的差异
来自cslt Wiki
(→Word2vec Test) |
(→Word2vec Test) |
||
第61行: | 第61行: | ||
|- | |- | ||
!20 | !20 | ||
− | |0.781094527 0.537313433 0.572139303 0.830845771 0.76119403 0.452736318 0.611940299 0.646766169 0.860696517 0.672747374 | + | |0.781094527|| 0.537313433|| 0.572139303|| 0.830845771|| 0.76119403|| 0.452736318|| 0.611940299|| 0.646766169|| 0.860696517|| 0.672747374 |
|- | |- | ||
!30 | !30 | ||
− | |0.815920398 0.671641791 0.606965174 0.835820896 0.766169154 0.552238806 0.577114428 0.68159204 0.885572139 0.710337203 | + | |0.815920398|| 0.671641791|| 0.606965174|| 0.835820896|| 0.766169154|| 0.552238806|| 0.577114428|| 0.68159204|| 0.885572139|| 0.710337203 |
|- | |- | ||
!40 | !40 | ||
− | |0.7960199 0.68159204 0.631840796 0.805970149 0.756218905 0.572139303 0.577114428 0.701492537 0.905472637 0.714206744 | + | |0.7960199|| 0.68159204|| 0.631840796|| 0.805970149|| 0.756218905|| 0.572139303|| 0.577114428|| 0.701492537|| 0.905472637|| 0.714206744 |
|- | |- | ||
!50 | !50 | ||
− | |0.805970149 0.691542289 0.641791045 0.800995025 0.751243781 0.552238806 0.651741294 0.656716418 0.910447761 0.718076285 | + | |0.805970149|| 0.691542289|| 0.641791045|| 0.800995025|| 0.751243781|| 0.552238806|| 0.651741294|| 0.656716418|| 0.910447761|| 0.718076285 |
|- | |- | ||
!60 | !60 | ||
− | |0.7960199 0.68159204 0.626865672 0.776119403 0.736318408 0.572139303 0.626865672 0.651741294 0.895522388 0.707020453 | + | |0.7960199|| 0.68159204|| 0.626865672|| 0.776119403|| 0.736318408|| 0.572139303|| 0.626865672|| 0.651741294|| 0.895522388|| 0.707020453 |
|- | |- | ||
!70 | !70 | ||
− | |0.7960199 0.701492537 0.621890547 0.781094527 0.771144279 0.572139303 0.631840796 0.656716418 0.905472637 0.715312327 | + | |0.7960199|| 0.701492537|| 0.621890547|| 0.781094527|| 0.771144279|| 0.572139303|| 0.631840796|| 0.656716418|| 0.905472637|| 0.715312327 |
|- | |- | ||
!80 | !80 | ||
− | |0.7960199 0.686567164 0.626865672 0.805970149 0.776119403 0.582089552 0.631840796 0.676616915 0.905472637 0.720840243 | + | |0.7960199|| 0.686567164|| 0.626865672|| 0.805970149|| 0.776119403|| 0.582089552|| 0.631840796|| 0.676616915|| 0.905472637|| 0.720840243 |
|- | |- | ||
!90 | !90 | ||
− | |0.805970149 0.71641791 0.621890547 0.776119403 0.766169154 0.572139303 0.646766169 0.666666667 0.915422886 0.720840243 | + | |0.805970149|| 0.71641791|| 0.621890547|| 0.776119403|| 0.766169154|| 0.572139303|| 0.646766169|| 0.666666667|| 0.915422886|| 0.720840243 |
|- | |- | ||
!100 | !100 | ||
− | |0.776119403 0.706467662 0.631840796 0.751243781 0.786069652 0.577114428 0.646766169 0.666666667 0.910447761 0.716970702 | + | |0.776119403|| 0.706467662|| 0.631840796|| 0.751243781|| 0.786069652|| 0.577114428|| 0.646766169|| 0.666666667|| 0.910447761|| 0.716970702 |
|- | |- | ||
!110 | !110 | ||
− | |0.771144279 0.71641791 0.656716418 0.741293532 0.76119403 0.597014925 0.606965174 0.691542289 0.910447761 0.716970702 | + | |0.771144279|| 0.71641791|| 0.656716418|| 0.741293532|| 0.76119403|| 0.597014925|| 0.606965174|| 0.691542289|| 0.910447761|| 0.716970702 |
|- | |- | ||
!120 | !120 | ||
− | |0.76119403 0.71641791 0.646766169 0.756218905 0.766169154 0.60199005 0.661691542 0.686567164 0.915422886 0.723604201 | + | |0.76119403|| 0.71641791|| 0.646766169|| 0.756218905|| 0.766169154|| 0.60199005|| 0.661691542|| 0.686567164|| 0.915422886|| 0.723604201 |
|- | |- | ||
!130 | !130 | ||
− | |0.776119403 0.731343284 0.631840796 0.76119403 0.771144279 0.577114428 0.626865672 0.701492537 0.905472637 0.720287452 | + | |0.776119403|| 0.731343284|| 0.631840796|| 0.76119403|| 0.771144279|| 0.577114428|| 0.626865672|| 0.701492537|| 0.905472637|| 0.720287452 |
|- | |- | ||
!140 | !140 | ||
− | |0.76119403 0.746268657 0.63681592 0.736318408 0.786069652 0.587064677 0.651741294 0.68159204 0.900497512 0.720840243 | + | |0.76119403|| 0.746268657|| 0.63681592|| 0.736318408|| 0.786069652|| 0.587064677|| 0.651741294|| 0.68159204|| 0.900497512|| 0.720840243 |
|- | |- | ||
!150 | !150 | ||
− | |0.756218905 0.726368159 0.63681592 0.736318408 0.771144279 0.611940299 0.651741294 0.686567164 0.910447761 0.720840243 | + | |0.756218905|| 0.726368159|| 0.63681592|| 0.736318408|| 0.771144279|| 0.611940299|| 0.651741294|| 0.686567164|| 0.910447761|| 0.720840243 |
|- | |- | ||
!160 | !160 | ||
− | |0.751243781 0.71641791 0.646766169 0.731343284 0.776119403 0.597014925 0.651741294 0.696517413 0.895522388 0.718076285 | + | |0.751243781|| 0.71641791|| 0.646766169|| 0.731343284|| 0.776119403|| 0.597014925|| 0.651741294|| 0.696517413|| 0.895522388|| 0.718076285 |
|- | |- | ||
!170 | !170 | ||
− | |0.756218905 0.741293532 0.661691542 0.731343284 0.766169154 0.60199005 0.651741294 0.666666667 0.900497512 0.71973466 | + | |0.756218905|| 0.741293532|| 0.661691542|| 0.731343284|| 0.766169154|| 0.60199005|| 0.651741294|| 0.666666667|| 0.900497512|| 0.71973466 |
|- | |- | ||
!180 | !180 | ||
− | |0.781094527 0.731343284 0.651741294 0.736318408 0.781094527 0.606965174 0.631840796 0.676616915 0.895522388 0.721393035 | + | |0.781094527|| 0.731343284|| 0.651741294|| 0.736318408|| 0.781094527|| 0.606965174|| 0.631840796|| 0.676616915|| 0.895522388|| 0.721393035 |
|- | |- | ||
!190 | !190 | ||
− | |0.771144279 0.726368159 0.661691542 0.731343284 0.766169154 0.60199005 0.631840796 0.706467662 0.900497512 0.721945826 | + | |0.771144279|| 0.726368159|| 0.661691542|| 0.731343284|| 0.766169154|| 0.60199005|| 0.631840796|| 0.706467662|| 0.900497512|| 0.721945826 |
|- | |- | ||
!200 | !200 | ||
− | |0.771144279 0.736318408 0.641791045 0.706467662 0.771144279 0.606965174 0.611940299 0.71641791 0.900497512 0.718076285 | + | |0.771144279|| 0.736318408|| 0.641791045|| 0.706467662|| 0.771144279|| 0.606965174|| 0.611940299|| 0.71641791|| 0.900497512|| 0.718076285 |
|- | |- | ||
|} | |} |
2014年9月9日 (二) 06:37的版本
目录
Problem and Solution
Document classification of Sogou data
- DATA
- Data from SogouLab [1], using SogouC.reduced (30M)
- 9-Classes:财经,IT,健康,体育,旅游,教育,招聘,文化,军事
- train and test: train(),test(),dev()
- Text preprocessing
- Segment words using a word list of 9W entries (Tencent).
- Remove stop words. The stop word list (stop_wordlist) is
- Some Tools
- weka
- scw
- google word2vec
- LDA
- class map
C000007 汽车 C000008 财经 C000010 IT C000013 健康 C000014 体育 C000016 旅游 C000020 教育 C000022 招聘 C000023 文化 C000024 军事
VSM Test
- Data
- dimension:9402
- Method
- document representation: use the tf-idf weight as the word weight
- classifier: Naive Bayes
- Result
| | 财经 | IT | 健康 | 体育 | 旅游 | 教育 | 招聘 | 文化 | 军事 | sum |
---|---|---|---|---|---|---|---|---|---|---|
ACC-test | 0.72139 | 0.72139 | 0.75124 | 0.82089 | 0.79602 | 0.61194 | 0.70647 | 0.64179 | 0.79104 | 0.72913 |
ACC-train | 0.678 | 0.718 | 0.708 | 0.708 | 0.73 |
LDA Test
Word2vec Test
- Word2vec result
Dimension | 财经 | IT | 健康 | 体育 | 旅游 | 教育 | 招聘 | 文化 | 军事 | sum |
---|---|---|---|---|---|---|---|---|---|---|
10 | 0.766169154 | 0.383084577 | 0.52238806 | 0.820895522 | 0.666666667 | 0.44278607 | 0.567164179 | 0.721393035 | 0.850746269 | 0.637921504 |
20 | 0.781094527 | 0.537313433 | 0.572139303 | 0.830845771 | 0.76119403 | 0.452736318 | 0.611940299 | 0.646766169 | 0.860696517 | 0.672747374 |
30 | 0.815920398 | 0.671641791 | 0.606965174 | 0.835820896 | 0.766169154 | 0.552238806 | 0.577114428 | 0.68159204 | 0.885572139 | 0.710337203 |
40 | 0.7960199 | 0.68159204 | 0.631840796 | 0.805970149 | 0.756218905 | 0.572139303 | 0.577114428 | 0.701492537 | 0.905472637 | 0.714206744 |
50 | 0.805970149 | 0.691542289 | 0.641791045 | 0.800995025 | 0.751243781 | 0.552238806 | 0.651741294 | 0.656716418 | 0.910447761 | 0.718076285 |
60 | 0.7960199 | 0.68159204 | 0.626865672 | 0.776119403 | 0.736318408 | 0.572139303 | 0.626865672 | 0.651741294 | 0.895522388 | 0.707020453 |
70 | 0.7960199 | 0.701492537 | 0.621890547 | 0.781094527 | 0.771144279 | 0.572139303 | 0.631840796 | 0.656716418 | 0.905472637 | 0.715312327 |
80 | 0.7960199 | 0.686567164 | 0.626865672 | 0.805970149 | 0.776119403 | 0.582089552 | 0.631840796 | 0.676616915 | 0.905472637 | 0.720840243 |
90 | 0.805970149 | 0.71641791 | 0.621890547 | 0.776119403 | 0.766169154 | 0.572139303 | 0.646766169 | 0.666666667 | 0.915422886 | 0.720840243 |
100 | 0.776119403 | 0.706467662 | 0.631840796 | 0.751243781 | 0.786069652 | 0.577114428 | 0.646766169 | 0.666666667 | 0.910447761 | 0.716970702 |
110 | 0.771144279 | 0.71641791 | 0.656716418 | 0.741293532 | 0.76119403 | 0.597014925 | 0.606965174 | 0.691542289 | 0.910447761 | 0.716970702 |
120 | 0.76119403 | 0.71641791 | 0.646766169 | 0.756218905 | 0.766169154 | 0.60199005 | 0.661691542 | 0.686567164 | 0.915422886 | 0.723604201 |
130 | 0.776119403 | 0.731343284 | 0.631840796 | 0.76119403 | 0.771144279 | 0.577114428 | 0.626865672 | 0.701492537 | 0.905472637 | 0.720287452 |
140 | 0.76119403 | 0.746268657 | 0.63681592 | 0.736318408 | 0.786069652 | 0.587064677 | 0.651741294 | 0.68159204 | 0.900497512 | 0.720840243 |
150 | 0.756218905 | 0.726368159 | 0.63681592 | 0.736318408 | 0.771144279 | 0.611940299 | 0.651741294 | 0.686567164 | 0.910447761 | 0.720840243 |
160 | 0.751243781 | 0.71641791 | 0.646766169 | 0.731343284 | 0.776119403 | 0.597014925 | 0.651741294 | 0.696517413 | 0.895522388 | 0.718076285 |
170 | 0.756218905 | 0.741293532 | 0.661691542 | 0.731343284 | 0.766169154 | 0.60199005 | 0.651741294 | 0.666666667 | 0.900497512 | 0.71973466 |
180 | 0.781094527 | 0.731343284 | 0.651741294 | 0.736318408 | 0.781094527 | 0.606965174 | 0.631840796 | 0.676616915 | 0.895522388 | 0.721393035 |
190 | 0.771144279 | 0.726368159 | 0.661691542 | 0.731343284 | 0.766169154 | 0.60199005 | 0.631840796 | 0.706467662 | 0.900497512 | 0.721945826 |
200 | 0.771144279 | 0.736318408 | 0.641791045 | 0.706467662 | 0.771144279 | 0.606965174 | 0.611940299 | 0.71641791 | 0.900497512 | 0.718076285 |