From 69e5f771ec1fe89f90d923b4549f98ba3e9b5b5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Cuevas?= Date: Fri, 3 Jul 2015 10:48:00 +0200 Subject: [PATCH] Initial commit --- .gitignore | 3 + README.md | 0 main.go | 208 +++++ src/tfidf/stop-words/stop-words-arabic.txt | 162 ++++ src/tfidf/stop-words/stop-words-catalian.txt | 126 +++ src/tfidf/stop-words/stop-words-czech1.txt | 138 ++++ src/tfidf/stop-words/stop-words-czech2.txt | 172 ++++ src/tfidf/stop-words/stop-words-dutch.txt | 48 ++ src/tfidf/stop-words/stop-words-english1.txt | 635 +++++++++++++++ src/tfidf/stop-words/stop-words-english2.txt | 174 ++++ .../stop-words/stop-words-english3-google.txt | 32 + src/tfidf/stop-words/stop-words-english4.txt | 671 ++++++++++++++++ src/tfidf/stop-words/stop-words-english5.txt | 319 ++++++++ src/tfidf/stop-words/stop-words-finnish.txt | 747 ++++++++++++++++++ src/tfidf/stop-words/stop-words-french.txt | 126 +++ src/tfidf/stop-words/stop-words-german.txt | 129 +++ src/tfidf/stop-words/stop-words-greek.txt | 79 ++ src/tfidf/stop-words/stop-words-hungarian.txt | 35 + src/tfidf/stop-words/stop-words-italian.txt | 134 ++++ src/tfidf/stop-words/stop-words-norwegian.txt | 119 +++ src/tfidf/stop-words/stop-words-polish1.txt | 138 ++++ src/tfidf/stop-words/stop-words-polish2.txt | 272 +++++++ src/tfidf/stop-words/stop-words-portugese.txt | 147 ++++ src/tfidf/stop-words/stop-words-russian.txt | 421 ++++++++++ src/tfidf/stop-words/stop-words-slovak.txt | 173 ++++ src/tfidf/stop-words/stop-words-slovak2.txt | 106 +++ src/tfidf/stop-words/stop-words-spanish.txt | 181 +++++ src/tfidf/stop-words/stop-words-swedish.txt | 386 +++++++++ src/tfidf/stop-words/stop-words-turkish.txt | 114 +++ src/tfidf/tfidf.go | 182 +++++ test.go | 15 + 31 files changed, 6192 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 main.go create mode 100644 src/tfidf/stop-words/stop-words-arabic.txt create mode 100644 
src/tfidf/stop-words/stop-words-catalian.txt create mode 100644 src/tfidf/stop-words/stop-words-czech1.txt create mode 100644 src/tfidf/stop-words/stop-words-czech2.txt create mode 100644 src/tfidf/stop-words/stop-words-dutch.txt create mode 100644 src/tfidf/stop-words/stop-words-english1.txt create mode 100644 src/tfidf/stop-words/stop-words-english2.txt create mode 100644 src/tfidf/stop-words/stop-words-english3-google.txt create mode 100644 src/tfidf/stop-words/stop-words-english4.txt create mode 100644 src/tfidf/stop-words/stop-words-english5.txt create mode 100644 src/tfidf/stop-words/stop-words-finnish.txt create mode 100644 src/tfidf/stop-words/stop-words-french.txt create mode 100644 src/tfidf/stop-words/stop-words-german.txt create mode 100644 src/tfidf/stop-words/stop-words-greek.txt create mode 100644 src/tfidf/stop-words/stop-words-hungarian.txt create mode 100644 src/tfidf/stop-words/stop-words-italian.txt create mode 100644 src/tfidf/stop-words/stop-words-norwegian.txt create mode 100644 src/tfidf/stop-words/stop-words-polish1.txt create mode 100644 src/tfidf/stop-words/stop-words-polish2.txt create mode 100644 src/tfidf/stop-words/stop-words-portugese.txt create mode 100644 src/tfidf/stop-words/stop-words-russian.txt create mode 100644 src/tfidf/stop-words/stop-words-slovak.txt create mode 100644 src/tfidf/stop-words/stop-words-slovak2.txt create mode 100644 src/tfidf/stop-words/stop-words-spanish.txt create mode 100644 src/tfidf/stop-words/stop-words-swedish.txt create mode 100644 src/tfidf/stop-words/stop-words-turkish.txt create mode 100644 src/tfidf/tfidf.go create mode 100644 test.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0be49ff --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.swp +main +websites.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/main.go b/main.go new file mode 100644 index 0000000..463dc99 --- /dev/null +++ b/main.go @@ -0,0 +1,208 @@ +package main 
+ +import ( + "fmt" + "io/ioutil" + "net/http" + "regexp" + "strings" + "os" + "tfidf" +) + +func getLinks(text, mainUrl string) []string { + result := make([]string, 0) + hrefLocator, _ := regexp.Compile("href=(\"|').*?(\"|')") + + elems := hrefLocator.FindAllString(text, -1) + + for _, link := range elems { + temp := strings.Replace(link, "href=", "", -1) + temp = strings.Replace(temp, "\"", "", -1) + temp = strings.Replace(temp, "'", "", -1) + + switch { + case strings.Contains(temp, ".png"): + continue + case strings.Contains(temp, ".jpg"): + continue + case strings.Contains(temp, ".gif"): + continue + case strings.Contains(temp, ".xml"): + continue + case strings.Contains(temp, ".css"): + continue + case strings.Contains(temp, ".js"): + continue + case !strings.Contains(temp, mainUrl): + continue + default: + result = append(result, temp) + } + + } + + return result +} + +func fetchAndGetLinks(url string, output chan []string) { + result := make([]string, 0) + + resp, err := http.Get(url) + + if err != nil { + output <- result + return + } + + body, err := ioutil.ReadAll(resp.Body) + + defer resp.Body.Close() + + if err != nil { + output <- result + return + } + + links := getLinks(string(body), url) + output <- links + +} + +func fetchText(url string, output chan []string) { + resp, err := http.Get(url) + if err != nil { + output <- nil + return + } + + defer resp.Body.Close() + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + output <- nil + return + } + + if strings.Contains(string(body), " 1 { + for count := 1; count < len(os.Args); count++ { + if strings.Contains(os.Args[count], option) { + result = os.Args[count] + result = strings.Replace(result, option, "", -1) + result = strings.TrimSpace(result) + + if result == "" { + result = os.Args[count + 1] + result = strings.Replace(result, option, "", -1) + result = strings.TrimSpace(result) + } + break + } + } + } + + return result +} + +func getWebsitesArgument() string { + return 
getCommandLineArgument("-w", "websites.txt") +} + +func getStopWordsArgument() string { + return getCommandLineArgument("-s", "english1") +} + +func main() { + var searchTerm string + var matrix tfidf.TfIdfMatrix + + fmt.Println("Init the TfIdf with stopwords") + + stopwordsLanguage := getStopWordsArgument() + matrix.LoadStopWords(fmt.Sprintf("./src/tfidf/stop-words/stop-words-%s.txt", stopwordsLanguage)) + + websites := loadWebsites(getWebsitesArgument()) + + input, sites := getDocuments(websites) + + matrix.TfIdf(input) + + running := true + + for running { + fmt.Println("Introduzca término de búsqueda o q para salir:") + fmt.Scanf("%s", &searchTerm) + + if searchTerm != "q" { + results, percentages := matrix.SearchDocumentsByWord(searchTerm) + fmt.Println("Resultados:") + for c, value := range results { + fmt.Printf("%d: %.2f%% - %s\n", c, percentages[c] * 10.0, sites[value]) + } + } else { + running = false + } + } +} diff --git a/src/tfidf/stop-words/stop-words-arabic.txt b/src/tfidf/stop-words/stop-words-arabic.txt new file mode 100644 index 0000000..c547d07 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-arabic.txt @@ -0,0 +1,162 @@ +فى +في +كل +لم +لن +له +من +هو +هي +قوة +كما +لها +منذ +وقد +ولا +نفسه +لقاء +مقابل +هناك +وقال +وكان +نهاية +وقالت +وكانت +للامم +فيه +كلم +لكن +وفي +وقف +ولم +ومن +وهو +وهي +يوم +فيها +منها +مليار +لوكالة +يكون +يمكن +مليون +حيث +اكد +الا +اما +امس +السابق +التى +التي +اكثر +ايار +ايضا +ثلاثة +الذاتي +الاخيرة +الثاني +الثانية +الذى +الذي +الان +امام +ايام +خلال +حوالى +الذين +الاول +الاولى +بين +ذلك +دون +حول +حين +الف +الى +انه +اول +ضمن +انها +جميع +الماضي +الوقت +المقبل +اليوم +ـ +ف +و +و6 +قد +لا +ما +مع +مساء +هذا +واحد +واضاف +واضافت +فان +قبل +قال +كان +لدى +نحو +هذه +وان +واكد +كانت +واوضح +مايو +ب +ا +أ +، +عشر +عدد +عدة +عشرة +عدم +عام +عاما +عن +عند +عندما +على +عليه +عليها +زيارة +سنة +سنوات +تم +ضد +بعد +بعض +اعادة +اعلنت +بسبب +حتى +اذا +احد +اثر +برس +باسم +غدا +شخصا +صباح +اطار +اربعة +اخرى +بان +اجل +غير 
+بشكل +حاليا +بن +به +ثم +اف +ان +او +اي +بها +صفر diff --git a/src/tfidf/stop-words/stop-words-catalian.txt b/src/tfidf/stop-words/stop-words-catalian.txt new file mode 100644 index 0000000..7479f08 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-catalian.txt @@ -0,0 +1,126 @@ +a +abans +algun +alguna +algunes +alguns +altre +amb +ambdós +anar +ans +aquell +aquelles +aquells +aquí +bastant +bé +cada +com +consegueixo +conseguim +conseguir +consigueix +consigueixen +consigueixes +dalt +de +des de +dins +el +elles +ells +els +en +ens +entre +era +erem +eren +eres +es +és +éssent +està +estan +estat +estava +estem +esteu +estic +ets +fa +faig +fan +fas +fem +fer +feu +fi +haver +i +inclòs +jo +la +les +llarg +llavors +mentre +meu +mode +molt +molts +nosaltres +o +on +per +per +per que +però +perquè +podem +poden +poder +podeu +potser +primer +puc +quan +quant +qui +sabem +saben +saber +sabeu +sap +saps +sense +ser +seu +seus +si +soc +solament +sols +som +sota +també +te +tene +tenim +tenir +teniu +teu +tinc +tot +últim +un +un +una +unes +uns +ús +va +vaig +van +vosaltres diff --git a/src/tfidf/stop-words/stop-words-czech1.txt b/src/tfidf/stop-words/stop-words-czech1.txt new file mode 100644 index 0000000..6582cd9 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-czech1.txt @@ -0,0 +1,138 @@ +aby +aj +ale +ani +asi +az +bez +bude +budem +budes +by +byl +byla +byli +bylo +byt +ci +clanek +clanku +clanky +co +coz +cz +dalsi +design +dnes +do +email +ho +jak +jako +je +jeho +jej +jeji +jejich +jen +jeste +ji +jine +jiz +jsem +jses +jsme +jsou +jste +kam +kde +kdo +kdyz +ke +ktera +ktere +kteri +kterou +ktery +ma +mate +mezi +mi +mit +muj +muze +na +nad +nam +napiste +nas +nasi +ne +nebo +nejsou +neni +nez +nic +nove +novy +od +pak +po +pod +podle +pokud +pouze +prave +pred +pres +pri +pro +proc +proto +protoze +prvni +pta +re +si +strana +sve +svych +svym +svymi +ta +tak +take +takze +tato +tedy +tema +ten +tento +teto +tim +timto +tipy +to +tohle +toho +tohoto 
+tom +tomto +tomuto +tu +tuto +ty +tyto +uz +vam +vas +vase +ve +vice +vsak +za +zda +zde +ze +zpet +zpravy diff --git a/src/tfidf/stop-words/stop-words-czech2.txt b/src/tfidf/stop-words/stop-words-czech2.txt new file mode 100644 index 0000000..809e9fe --- /dev/null +++ b/src/tfidf/stop-words/stop-words-czech2.txt @@ -0,0 +1,172 @@ +a +aby +aj +ale +ani +aniž +ano +asi +až +bez +bude +budem +budeš +by +byl +byla +byli +bylo +být +co +což +cz +či +článek +článku +články +další +dnes +do +ho +i +já +jak +jako +je +jeho +jej +její +jejich +jen +jenž +ještě +ji +jiné +již +jsem +jseš +jsme +jsou +jšte +k +kam +každý +kde +kdo +když +ke +která +které +kterou +který +kteři +ku +ma +máte +me +mě +mezi +mi +mít +mně +mnou +můj +může +my +na +ná +nad +nám +napište +náš +naši +ne +nebo +nechť +nejsou +není +než +ní +nic +nové +nový +o +od +ode +on +pak +po +pod +podle +pokud +pouze +práve +pro +proč +proto +protože +první +před +přede +přes +při +pta +re +s +se +si +sice +strana +své +svůj +svých +svým +svými +ta +tak +také +takže +tato +te +tě +tedy +těma +ten +tento +této +tím +tímto +tipy +to +to +tohle +toho +tohoto +tom +tomto +tomuto +toto +tu +tuto +tvůj +ty +tyto +u +už +v +vám +váš +vaše +ve +více +však +všechen +vy +z +za +zda +zde +ze +zpět +zprávy +že diff --git a/src/tfidf/stop-words/stop-words-dutch.txt b/src/tfidf/stop-words/stop-words-dutch.txt new file mode 100644 index 0000000..37c2ba9 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-dutch.txt @@ -0,0 +1,48 @@ +aan +af +al +als +bij +dan +dat +die +dit +een +en +er +had +heb +hem +het +hij +hoe +hun +ik +in +is +je +kan +me +men +met +mij +nog +nu +of +ons +ook +te +tot +uit +van +was +wat +we +wel +wij +zal +ze +zei +zij +zo +zou diff --git a/src/tfidf/stop-words/stop-words-english1.txt b/src/tfidf/stop-words/stop-words-english1.txt new file mode 100644 index 0000000..6e8c2fb --- /dev/null +++ b/src/tfidf/stop-words/stop-words-english1.txt @@ -0,0 +1,635 @@ +able +about +above +abroad +according 
+accordingly +across +actually +adj +after +afterwards +again +against +ago +ahead +ain't +all +allow +allows +almost +alone +along +alongside +already +also +although +always +am +amid +amidst +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +a's +aside +ask +asking +associated +at +available +away +awfully +back +backward +backwards +be +became +because +become +becomes +becoming +been +before +beforehand +begin +behind +being +believe +below +beside +besides +best +better +between +beyond +both +brief +but +by +came +can +cannot +cant +can't +caption +cause +causes +certain +certainly +changes +clearly +c'mon +co +co. +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +c's +currently +dare +daren't +definitely +described +despite +did +didn't +different +directly +do +does +doesn't +doing +done +don't +down +downwards +during +each +edu +eg +eight +eighty +either +else +elsewhere +end +ending +enough +entirely +especially +et +etc +even +ever +evermore +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +fairly +far +farther +few +fewer +fifth +first +five +followed +following +follows +for +forever +former +formerly +forth +forward +found +four +from +further +furthermore +get +gets +getting +given +gives +go +goes +going +gone +got +gotten +greetings +had +hadn't +half +happens +hardly +has +hasn't +have +haven't +having +he +he'd +he'll +hello +help +hence +her +here +hereafter +hereby +herein +here's +hereupon +hers +herself +he's +hi +him +himself +his +hither +hopefully +how +howbeit +however +hundred +i'd +ie +if +ignored +i'll +i'm +immediate +in +inasmuch +inc +inc. 
+indeed +indicate +indicated +indicates +inner +inside +insofar +instead +into +inward +is +isn't +it +it'd +it'll +its +it's +itself +i've +just +k +keep +keeps +kept +know +known +knows +last +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +likewise +little +look +looking +looks +low +lower +ltd +made +mainly +make +makes +many +may +maybe +mayn't +me +mean +meantime +meanwhile +merely +might +mightn't +mine +minus +miss +more +moreover +most +mostly +mr +mrs +much +must +mustn't +my +myself +name +namely +nd +near +nearly +necessary +need +needn't +needs +neither +never +neverf +neverless +nevertheless +new +next +nine +ninety +no +nobody +non +none +nonetheless +noone +no-one +nor +normally +not +nothing +notwithstanding +novel +now +nowhere +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +one's +only +onto +opposite +or +other +others +otherwise +ought +oughtn't +our +ours +ourselves +out +outside +over +overall +own +particular +particularly +past +per +perhaps +placed +please +plus +possible +presumably +probably +provided +provides +que +quite +qv +rather +rd +re +really +reasonably +recent +recently +regarding +regardless +regards +relatively +respectively +right +round +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +shan't +she +she'd +she'll +she's +should +shouldn't +since +six +so +some +somebody +someday +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +take +taken +taking +tell +tends +th +than +thank +thanks +thanx +that +that'll +thats +that's +that've +the +their +theirs +them +themselves +then +thence +there +thereafter +thereby +there'd +therefore +therein +there'll +there're +theres +there's +thereupon +there've +these +they +they'd +they'll +they're +they've +thing 
+things +think +third +thirty +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +till +to +together +too +took +toward +towards +tried +tries +truly +try +trying +t's +twice +two +un +under +underneath +undoing +unfortunately +unless +unlike +unlikely +until +unto +up +upon +upwards +us +use +used +useful +uses +using +usually +v +value +various +versus +very +via +viz +vs +want +wants +was +wasn't +way +we +we'd +welcome +well +we'll +went +were +we're +weren't +we've +what +whatever +what'll +what's +what've +when +whence +whenever +where +whereafter +whereas +whereby +wherein +where's +whereupon +wherever +whether +which +whichever +while +whilst +whither +who +who'd +whoever +whole +who'll +whom +whomever +who's +whose +why +will +willing +wish +with +within +without +wonder +won't +would +wouldn't +yes +yet +you +you'd +you'll +your +you're +yours +yourself +yourselves +you've +zero diff --git a/src/tfidf/stop-words/stop-words-english2.txt b/src/tfidf/stop-words/stop-words-english2.txt new file mode 100644 index 0000000..2a674a0 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-english2.txt @@ -0,0 +1,174 @@ +a +about +above +after +again +against +all +am +an +and +any +are +aren't +as +at +be +because +been +before +being +below +between +both +but +by +can't +cannot +could +couldn't +did +didn't +do +does +doesn't +doing +don't +down +during +each +few +for +from +further +had +hadn't +has +hasn't +have +haven't +having +he +he'd +he'll +he's +her +here +here's +hers +herself +him +himself +his +how +how's +i +i'd +i'll +i'm +i've +if +in +into +is +isn't +it +it's +its +itself +let's +me +more +most +mustn't +my +myself +no +nor +not +of +off +on +once +only +or +other +ought +our +ours +ourselves +out +over +own +same +shan't +she +she'd +she'll +she's +should +shouldn't +so +some +such +than +that +that's +the +their +theirs +them +themselves +then +there +there's +these +they +they'd +they'll +they're +they've +this 
+those +through +to +too +under +until +up +very +was +wasn't +we +we'd +we'll +we're +we've +were +weren't +what +what's +when +when's +where +where's +which +while +who +who's +whom +why +why's +with +won't +would +wouldn't +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves diff --git a/src/tfidf/stop-words/stop-words-english3-google.txt b/src/tfidf/stop-words/stop-words-english3-google.txt new file mode 100644 index 0000000..b8c36dd --- /dev/null +++ b/src/tfidf/stop-words/stop-words-english3-google.txt @@ -0,0 +1,32 @@ +I +a +about +an +are +as +at +be +by +com +for +from +how +in +is +it +of +on +or +that +the +this +to +was +what +when +where +who +will +with +the +www diff --git a/src/tfidf/stop-words/stop-words-english4.txt b/src/tfidf/stop-words/stop-words-english4.txt new file mode 100644 index 0000000..699306a --- /dev/null +++ b/src/tfidf/stop-words/stop-words-english4.txt @@ -0,0 +1,671 @@ +a +able +about +above +abst +accordance +according +accordingly +across +act +actually +added +adj +adopted +affected +affecting +affects +after +afterwards +again +against +ah +all +almost +alone +along +already +also +although +always +am +among +amongst +an +and +announce +another +any +anybody +anyhow +anymore +anyone +anything +anyway +anyways +anywhere +apparently +approximately +are +aren +arent +arise +around +as +aside +ask +asking +at +auth +available +away +awfully +b +back +be +became +because +become +becomes +becoming +been +before +beforehand +begin +beginning +beginnings +begins +behind +being +believe +below +beside +besides +between +beyond +biol +both +brief +briefly +but +by +c +ca +came +can +cannot +can't +cause +causes +certain +certainly +co +com +come +comes +contain +containing +contains +could +couldnt +d +date +did +didn't +different +do +does +doesn't +doing +done +don't +down +downwards +due +during +e +each +ed +edu +effect +eg +eight +eighty +either +else +elsewhere +end +ending +enough +especially +et +et-al +etc 
+even +ever +every +everybody +everyone +everything +everywhere +ex +except +f +far +few +ff +fifth +first +five +fix +followed +following +follows +for +former +formerly +forth +found +four +from +further +furthermore +g +gave +get +gets +getting +give +given +gives +giving +go +goes +gone +got +gotten +h +had +happens +hardly +has +hasn't +have +haven't +having +he +hed +hence +her +here +hereafter +hereby +herein +heres +hereupon +hers +herself +hes +hi +hid +him +himself +his +hither +home +how +howbeit +however +hundred +i +id +ie +if +i'll +im +immediate +immediately +importance +important +in +inc +indeed +index +information +instead +into +invention +inward +is +isn't +it +itd +it'll +its +itself +i've +j +just +k +keep +keeps +kept +keys +kg +km +know +known +knows +l +largely +last +lately +later +latter +latterly +least +less +lest +let +lets +like +liked +likely +line +little +'ll +look +looking +looks +ltd +m +made +mainly +make +makes +many +may +maybe +me +mean +means +meantime +meanwhile +merely +mg +might +million +miss +ml +more +moreover +most +mostly +mr +mrs +much +mug +must +my +myself +n +na +name +namely +nay +nd +near +nearly +necessarily +necessary +need +needs +neither +never +nevertheless +new +next +nine +ninety +no +nobody +non +none +nonetheless +noone +nor +normally +nos +not +noted +nothing +now +nowhere +o +obtain +obtained +obviously +of +off +often +oh +ok +okay +old +omitted +on +once +one +ones +only +onto +or +ord +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +owing +own +p +page +pages +part +particular +particularly +past +per +perhaps +placed +please +plus +poorly +possible +possibly +potentially +pp +predominantly +present +previously +primarily +probably +promptly +proud +provides +put +q +que +quickly +quite +qv +r +ran +rather +rd +re +readily +really +recent +recently +ref +refs +regarding +regardless +regards +related +relatively +research +respectively +resulted +resulting 
+results +right +run +s +said +same +saw +say +saying +says +sec +section +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sent +seven +several +shall +she +shed +she'll +shes +should +shouldn't +show +showed +shown +showns +shows +significant +significantly +similar +similarly +since +six +slightly +so +some +somebody +somehow +someone +somethan +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specifically +specified +specify +specifying +state +states +still +stop +strongly +sub +substantially +successfully +such +sufficiently +suggest +sup +sure +t +take +taken +taking +tell +tends +th +than +thank +thanks +thanx +that +that'll +thats +that've +the +their +theirs +them +themselves +then +thence +there +thereafter +thereby +thered +therefore +therein +there'll +thereof +therere +theres +thereto +thereupon +there've +these +they +theyd +they'll +theyre +they've +think +this +those +thou +though +thoughh +thousand +throug +through +throughout +thru +thus +til +tip +to +together +too +took +toward +towards +tried +tries +truly +try +trying +ts +twice +two +u +un +under +unfortunately +unless +unlike +unlikely +until +unto +up +upon +ups +us +use +used +useful +usefully +usefulness +uses +using +usually +v +value +various +'ve +very +via +viz +vol +vols +vs +w +want +wants +was +wasn't +way +we +wed +welcome +we'll +went +were +weren't +we've +what +whatever +what'll +whats +when +whence +whenever +where +whereafter +whereas +whereby +wherein +wheres +whereupon +wherever +whether +which +while +whim +whither +who +whod +whoever +whole +who'll +whom +whomever +whos +whose +why +widely +willing +wish +with +within +without +won't +words +world +would +wouldn't +www +x +y +yes +yet +you +youd +you'll +your +youre +yours +yourself +yourselves +you've +z +zero diff --git a/src/tfidf/stop-words/stop-words-english5.txt b/src/tfidf/stop-words/stop-words-english5.txt new file mode 100644 index 0000000..b8ad594 --- /dev/null +++ 
b/src/tfidf/stop-words/stop-words-english5.txt @@ -0,0 +1,319 @@ +a +about +above +across +after +afterwards +again +against +all +almost +alone +along +already +also +although +always +am +among +amongst +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anyway +anywhere +are +around +as +at +back +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +between +beyond +bill +both +bottom +but +by +call +can +cannot +cant +co +computer +con +could +couldnt +cry +de +describe +detail +do +done +down +due +during +each +eg +eight +either +eleven +else +elsewhere +empty +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifteen +fify +fill +find +fire +first +five +for +former +formerly +forty +found +four +from +front +full +further +get +give +go +had +has +hasnt +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +herse” +him +himse” +his +how +however +hundred +i +ie +if +in +inc +indeed +interest +into +is +it +its +itse” +keep +last +latter +latterly +least +less +ltd +made +many +may +me +meanwhile +might +mill +mine +more +moreover +most +mostly +move +much +must +my +myse” +name +namely +neither +never +nevertheless +next +nine +no +nobody +none +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +other +others +otherwise +our +ours +ourselves +out +over +own +part +per +perhaps +please +put +rather +re +same +see +seem +seemed +seeming +seems +serious +several +she +should +show +side +since +sincere +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +such +system +take +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thick +thin +third +this +those +though +three +through +throughout +thru +thus +to +together +too +top +toward +towards +twelve +twenty +two +un +under +until +up 
+upon +us +very +via +was +we +well +were +what +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +whoever +whole +whom +whose +why +will +with +within +without +would +yet +you +your +yours +yourself +yourselves diff --git a/src/tfidf/stop-words/stop-words-finnish.txt b/src/tfidf/stop-words/stop-words-finnish.txt new file mode 100644 index 0000000..7f5742e --- /dev/null +++ b/src/tfidf/stop-words/stop-words-finnish.txt @@ -0,0 +1,747 @@ +aiemmin +aika +aikaa +aikaan +aikaisemmin +aikaisin +aikajen +aikana +aikoina +aikoo +aikovat +aina +ainakaan +ainakin +ainoa +ainoat +aiomme +aion +aiotte +aist +aivan +ajan +älä +alas +alemmas +älköön +alkuisin +alkuun +alla +alle +aloitamme +aloitan +aloitat +aloitatte +aloitattivat +aloitettava +aloitettevaksi +aloitettu +aloitimme +aloitin +aloitit +aloititte +aloittaa +aloittamatta +aloitti +aloittivat +alta +aluksi +alussa +alusta +annettavaksi +annetteva +annettu +antaa +antamatta +antoi +aoua +apu +asia +asiaa +asian +asiasta +asiat +asioiden +asioihin +asioita +asti +avuksi +avulla +avun +avutta +edellä +edelle +edelleen +edeltä +edemmäs +edes +edessä +edestä +ehkä +ei +eikä +eilen +eivät +eli +ellei +elleivät +ellemme +ellen +ellet +ellette +emme +en +enää +enemmän +eniten +ennen +ensi +ensimmäinen +ensimmäiseksi +ensimmäisen +ensimmäisenä +ensimmäiset +ensimmäisiä +ensimmäisiksi +ensimmäisinä +ensimmäistä +ensin +entinen +entisen +entisiä +entistä +entisten +eräät +eräiden +eräs +eri +erittäin +erityisesti +esi +esiin +esillä +esimerkiksi +et +eteen +etenkin +että +ette +ettei +halua +haluaa +haluamatta +haluamme +haluan +haluat +haluatte +haluavat +halunnut +halusi +halusimme +halusin +halusit +halusitte +halusivat +halutessa +haluton +hän +häneen +hänellä +hänelle +häneltä +hänen +hänessä +hänestä +hänet +he +hei +heidän +heihin +heille +heiltä +heissä +heistä +heitä +helposti +heti +hetkellä +hieman +huolimatta +huomenna +hyvä 
+hyvää +hyvät +hyviä +hyvien +hyviin +hyviksi +hyville +hyviltä +hyvin +hyvinä +hyvissä +hyvistä +ihan +ilman +ilmeisesti +itse +itseään +itsensä +ja +jää +jälkeen +jälleen +jo +johon +joiden +joihin +joiksi +joilla +joille +joilta +joissa +joista +joita +joka +jokainen +jokin +joko +joku +jolla +jolle +jolloin +jolta +jompikumpi +jonka +jonkin +jonne +joo +jopa +jos +joskus +jossa +josta +jota +jotain +joten +jotenkin +jotenkuten +jotka +jotta +jouduimme +jouduin +jouduit +jouduitte +joudumme +joudun +joudutte +joukkoon +joukossa +joukosta +joutua +joutui +joutuivat +joutumaan +joutuu +joutuvat +juuri +kahdeksan +kahdeksannen +kahdella +kahdelle +kahdelta +kahden +kahdessa +kahdesta +kahta +kahteen +kai +kaiken +kaikille +kaikilta +kaikkea +kaikki +kaikkia +kaikkiaan +kaikkialla +kaikkialle +kaikkialta +kaikkien +kaikkin +kaksi +kannalta +kannattaa +kanssa +kanssaan +kanssamme +kanssani +kanssanne +kanssasi +kauan +kauemmas +kautta +kehen +keiden +keihin +keiksi +keillä +keille +keiltä +keinä +keissä +keistä +keitä +keittä +keitten +keneen +keneksi +kenellä +kenelle +keneltä +kenen +kenenä +kenessä +kenestä +kenet +kenettä +kennessästä +kerran +kerta +kertaa +kesken +keskimäärin +ketä +ketkä +kiitos +kohti +koko +kokonaan +kolmas +kolme +kolmen +kolmesti +koska +koskaan +kovin +kuin +kuinka +kuitenkaan +kuitenkin +kuka +kukaan +kukin +kumpainen +kumpainenkaan +kumpi +kumpikaan +kumpikin +kun +kuten +kuuden +kuusi +kuutta +kyllä +kymmenen +kyse +lähekkäin +lähellä +lähelle +läheltä +lähemmäs +lähes +lähinnä +lähtien +läpi +liian +liki +lisää +lisäksi +luo +mahdollisimman +mahdollista +me +meidän +meillä +meille +melkein +melko +menee +meneet +menemme +menen +menet +menette +menevät +meni +menimme +menin +menit +menivät +mennessä +mennyt +menossa +mihin +mikä +mikään +mikäli +mikin +miksi +milloin +minä +minne +minun +minut +missä +mistä +mitä +mitään +miten +moi +molemmat +mones +monesti +monet +moni +moniaalla +moniaalle +moniaalta +monta +muassa +muiden +muita 
+muka +mukaan +mukaansa +mukana +mutta +muu +muualla +muualle +muualta +muuanne +muulloin +muun +muut +muuta +muutama +muutaman +muuten +myöhemmin +myös +myöskään +myöskin +myötä +näiden +näin +näissä +näissähin +näissälle +näissältä +näissästä +näitä +nämä +ne +neljä +neljää +neljän +niiden +niin +niistä +niitä +noin +nopeammin +nopeasti +nopeiten +nro +nuo +nyt +ohi +oikein +ole +olemme +olen +olet +olette +oleva +olevan +olevat +oli +olimme +olin +olisi +olisimme +olisin +olisit +olisitte +olisivat +olit +olitte +olivat +olla +olleet +olli +ollut +oma +omaa +omaan +omaksi +omalle +omalta +oman +omassa +omat +omia +omien +omiin +omiksi +omille +omilta +omissa +omista +on +onkin +onko +ovat +päälle +paikoittain +paitsi +pakosti +paljon +paremmin +parempi +parhaillaan +parhaiten +peräti +perusteella +pian +pieneen +pieneksi +pienellä +pienelle +pieneltä +pienempi +pienestä +pieni +pienin +puolesta +puolestaan +runsaasti +saakka +sadam +sama +samaa +samaan +samalla +samallalta +samallassa +samallasta +saman +samat +samoin +sata +sataa +satojen +se +seitsemän +sekä +sen +seuraavat +siellä +sieltä +siihen +siinä +siis +siitä +sijaan +siksi +sillä +silloin +silti +sinä +sinne +sinua +sinulle +sinulta +sinun +sinussa +sinusta +sinut +sisäkkäin +sisällä +sitä +siten +sitten +suoraan +suuntaan +suuren +suuret +suuri +suuria +suurin +suurten +taa +täällä +täältä +taas +taemmas +tähän +tahansa +tai +takaa +takaisin +takana +takia +tällä +tällöin +tämä +tämän +tänä +tänään +tänne +tapauksessa +tässä +tästä +tätä +täten +tavalla +tavoitteena +täysin +täytyvät +täytyy +te +tietysti +todella +toinen +toisaalla +toisaalle +toisaalta +toiseen +toiseksi +toisella +toiselle +toiselta +toisemme +toisen +toisensa +toisessa +toisesta +toista +toistaiseksi +toki +tosin +tuhannen +tuhat +tule +tulee +tulemme +tulen +tulet +tulette +tulevat +tulimme +tulin +tulisi +tulisimme +tulisin +tulisit +tulisitte +tulisivat +tulit +tulitte +tulivat +tulla +tulleet +tullut +tuntuu +tuo +tuolla 
+tuolloin +tuolta +tuonne +tuskin +tykö +usea +useasti +useimmiten +usein +useita +uudeksi +uudelleen +uuden +uudet +uusi +uusia +uusien +uusinta +uuteen +uutta +vaan +vähän +vähemmän +vähintään +vähiten +vai +vaiheessa +vaikea +vaikean +vaikeat +vaikeilla +vaikeille +vaikeilta +vaikeissa +vaikeista +vaikka +vain +välillä +varmasti +varsin +varsinkin +varten +vasta +vastaan +vastakkain +verran +vielä +vierekkäin +vieri +viiden +viime +viimeinen +viimeisen +viimeksi +viisi +voi +voidaan +voimme +voin +voisi +voit +voitte +voivat +vuoden +vuoksi +vuosi +vuosien +vuosina +vuotta +yhä +yhdeksän +yhden +yhdessä +yhtä +yhtäällä +yhtäälle +yhtäältä +yhtään +yhteen +yhteensä +yhteydessä +yhteyteen +yksi +yksin +yksittäin +yleensä +ylemmäs +yli +ylös +ympäri diff --git a/src/tfidf/stop-words/stop-words-french.txt b/src/tfidf/stop-words/stop-words-french.txt new file mode 100644 index 0000000..b44c17d --- /dev/null +++ b/src/tfidf/stop-words/stop-words-french.txt @@ -0,0 +1,126 @@ +alors +au +aucuns +aussi +autre +avant +avec +avoir +bon +car +ce +cela +ces +ceux +chaque +ci +comme +comment +dans +des +du +dedans +dehors +depuis +deux +devrait +doit +donc +dos +droite +début +elle +elles +en +encore +essai +est +et +eu +fait +faites +fois +font +force +haut +hors +ici +il +ils +je +juste +la +le +les +leur +là +ma +maintenant +mais +mes +mine +moins +mon +mot +même +ni +nommés +notre +nous +nouveaux +ou +où +par +parce +parole +pas +personnes +peut +peu +pièce +plupart +pour +pourquoi +quand +que +quel +quelle +quelles +quels +qui +sa +sans +ses +seulement +si +sien +son +sont +sous +soyez +sujet +sur +ta +tandis +tellement +tels +tes +ton +tous +tout +trop +très +tu +valeur +voie +voient +vont +votre +vous +vu +ça +étaient +état +étions +été +être diff --git a/src/tfidf/stop-words/stop-words-german.txt b/src/tfidf/stop-words/stop-words-german.txt new file mode 100644 index 0000000..af52194 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-german.txt @@ -0,0 +1,129 @@ 
+aber +als +am +an +auch +auf +aus +bei +bin +bis +bist +da +dadurch +daher +darum +das +daß +dass +dein +deine +dem +den +der +des +dessen +deshalb +die +dies +dieser +dieses +doch +dort +du +durch +ein +eine +einem +einen +einer +eines +er +es +euer +eure +für +hatte +hatten +hattest +hattet +hier +hinter +ich +ihr +ihre +im +in +ist +ja +jede +jedem +jeden +jeder +jedes +jener +jenes +jetzt +kann +kannst +können +könnt +machen +mein +meine +mit +muß +mußt +musst +müssen +müßt +nach +nachdem +nein +nicht +nun +oder +seid +sein +seine +sich +sie +sind +soll +sollen +sollst +sollt +sonst +soweit +sowie +und +unser +unsere +unter +vom +von +vor +wann +warum +was +weiter +weitere +wenn +wer +werde +werden +werdet +weshalb +wie +wieder +wieso +wir +wird +wirst +wo +woher +wohin +zu +zum +zur +über diff --git a/src/tfidf/stop-words/stop-words-greek.txt b/src/tfidf/stop-words/stop-words-greek.txt new file mode 100644 index 0000000..9e7eba3 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-greek.txt @@ -0,0 +1,79 @@ +μή +ἑαυτοῦ +ἄν +ἀλλ’ +ἀλλά +ἄλλοσ +ἀπό +ἄρα +αὐτόσ +δ’ +δέ +δή +διά +δαί +δαίσ +ἔτι +ἐγώ +ἐκ +ἐμόσ +ἐν +ἐπί +εἰ +εἰμί +εἴμι +εἰσ +γάρ +γε +γα^ +ἡ +ἤ +καί +κατά +μέν +μετά +μή +ὁ +ὅδε +ὅσ +ὅστισ +ὅτι +οὕτωσ +οὗτοσ +οὔτε +οὖν +οὐδείσ +οἱ +οὐ +οὐδέ +οὐκ +περί +πρόσ +σύ +σύν +τά +τε +τήν +τῆσ +τῇ +τι +τί +τισ +τίσ +τό +τοί +τοιοῦτοσ +τόν +τούσ +τοῦ +τῶν +τῷ +ὑμόσ +ὑπέρ +ὑπό +ὡσ +ὦ +ὥστε +ἐάν +παρά +σόσ diff --git a/src/tfidf/stop-words/stop-words-hungarian.txt b/src/tfidf/stop-words/stop-words-hungarian.txt new file mode 100644 index 0000000..e23d97f --- /dev/null +++ b/src/tfidf/stop-words/stop-words-hungarian.txt @@ -0,0 +1,35 @@ +a +át +az +be +csak +de +egy +el +én +és +fel +hát +hogy +ide +igen +ki +le +lesz +meg +mi +mint +nem +õ +oda +õk +ön +össze +rá +szét +te +ti +vagy +van +vissza +volt diff --git a/src/tfidf/stop-words/stop-words-italian.txt b/src/tfidf/stop-words/stop-words-italian.txt new file mode 100644 index 0000000..bdab8ac --- /dev/null +++ 
b/src/tfidf/stop-words/stop-words-italian.txt @@ -0,0 +1,134 @@ +a +adesso +ai +al +alla +allo +allora +altre +altri +altro +anche +ancora +avere +aveva +avevano +ben +buono +che +chi +cinque +comprare +con +consecutivi +consecutivo +cosa +cui +da +del +della +dello +dentro +deve +devo +di +doppio +due +e +ecco +fare +fine +fino +fra +gente +giu +ha +hai +hanno +ho +il +indietro +invece +io +la +lavoro +le +lei +lo +loro +lui +lungo +ma +me +meglio +molta +molti +molto +nei +nella +no +noi +nome +nostro +nove +nuovi +nuovo +o +oltre +ora +otto +peggio +pero +persone +piu +poco +primo +promesso +qua +quarto +quasi +quattro +quello +questo +qui +quindi +quinto +rispetto +sara +secondo +sei +sembra +sembrava +senza +sette +sia +siamo +siete +solo +sono +sopra +soprattutto +sotto +stati +stato +stesso +su +subito +sul +sulla +tanto +te +tempo +terzo +tra +tre +triplo +ultimo +un +una +uno +va +vai +voi +volte +vostro diff --git a/src/tfidf/stop-words/stop-words-norwegian.txt b/src/tfidf/stop-words/stop-words-norwegian.txt new file mode 100644 index 0000000..4e2a039 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-norwegian.txt @@ -0,0 +1,119 @@ +å +alle +andre +arbeid +av +begge +bort +bra +bruke +da +denne +der +deres +det +din +disse +du +eller +en +ene +eneste +enhver +enn +er +et +få +folk +for +fordi +forsøke +fra +før +først +gå +gjorde +gjøre +god +ha +hadde +han +hans +hennes +her +hva +hvem +hver +hvilken +hvis +hvor +hvordan +hvorfor +i +ikke +inn +innen +kan +kunne +lage +lang +lik +like +må +makt +mange +måte +med +meg +meget +men +mens +mer +mest +min +mye +nå +når +navn +nei +ny +og +også +om +opp +oss +over +på +part +punkt +rett +riktig +så +samme +sant +si +siden +sist +skulle +slik +slutt +som +start +stille +tid +til +tilbake +tilstand +under +ut +uten +var +vår +ved +verdi +vi +vil +ville +vite +være +vært diff --git a/src/tfidf/stop-words/stop-words-polish1.txt b/src/tfidf/stop-words/stop-words-polish1.txt new file mode 100644 index
0000000..2f8823a --- /dev/null +++ b/src/tfidf/stop-words/stop-words-polish1.txt @@ -0,0 +1,138 @@ +ach +aj +albo +bardzo +bez +bo +być +ci +cię +ciebie +co +czy +daleko +dla +dlaczego +dlatego +do +dobrze +dokąd +dość +dużo +dwa +dwaj +dwie +dwoje +dziś +dzisiaj +gdyby +gdzie +go +ich +ile +im +inny +ja +ją +jak +jakby +jaki +je +jeden +jedna +jedno +jego +jej +jemu +jeśli +jest +jestem +jeżeli +już +każdy +kiedy +kierunku +kto +ku +lub +ma +mają +mam +mi +mną +mnie +moi +mój +moja +moje +może +mu +my +na +nam +nami +nas +nasi +nasz +nasza +nasze +natychmiast +nią +nic +nich +nie +niego +niej +niemu +nigdy +nim +nimi +niż +obok +od +około +on +ona +one +oni +ono +owszem +po +pod +ponieważ +przed +przedtem +są +sam +sama +się +skąd +tak +taki +tam +ten +to +tobą +tobie +tu +tutaj +twoi +twój +twoja +twoje +ty +wam +wami +was +wasi +wasz +wasza +wasze +we +więc +wszystko +wtedy +wy +żaden +zawsze +że diff --git a/src/tfidf/stop-words/stop-words-polish2.txt b/src/tfidf/stop-words/stop-words-polish2.txt new file mode 100644 index 0000000..cd16c2a --- /dev/null +++ b/src/tfidf/stop-words/stop-words-polish2.txt @@ -0,0 +1,272 @@ +a +aby +ach +acz +aczkolwiek +aj +albo +ale +ależ +aż +bardziej +bardzo +bez +bo +bowiem +by +byli +bynajmniej +być +był +była +było +były +będzie +będą +cali +cała +cały +ci +cię +ciebie +co +cokolwiek +coś +czasami +czasem +czemu +czy +czyli +daleko +dla +dlaczego +dlatego +do +dobrze +dokąd +dość +dużo +dwa +dwaj +dwie +dwoje +dziś +dzisiaj +gdy +gdyby +gdyż +gdzie +gdziekolwiek +gdzieś +go +i +ich +ile +im +inna +inne +inny +innych +iż +ja +ją +jak +jakaś +jakby +jaki +jakichś +jakie +jakiś +jakiż +jakkolwiek +jako +jakoś +je +jeden +jedna +jedno +jednak +jednakże +jego +jej +jemu +jest +jestem +jeszcze +jeśli +jeżeli +już +ją +każdy +kiedy +kilka +kimś +kto +ktokolwiek +ktoś +która +które +którego +której +który +których +którym +którzy +ku +lat +lecz +lub +ma +mają +mam +mi +mimo +między +mną +mnie +mogą +moi +moim +moja +moje +może 
+możliwe +można +mój +mu +musi +my +na +nad +nam +nami +nas +nasi +nasz +nasza +nasze +naszego +naszych +natomiast +natychmiast +nawet +nią +nic +nich +nie +niego +niej +niemu +nigdy +nim +nimi +niż +no +o +obok +od +około +on +ona +one +oni +ono +oraz +owszem +pan +pana +pani +po +pod +podczas +pomimo +ponad +ponieważ +powinien +powinna +powinni +powinno +poza +prawie +przecież +przed +przede +przedtem +przez +przy +roku +również +sam +sama +są +się +skąd +sobie +sobą +sposób +swoje +są +ta +tak +taka +taki +takie +także +tam +te +tego +tej +ten +teraz +też +totobą +tobie +toteż +trzeba +tu +tutaj +twoi +twoim +twoja +twoje +twym +twój +ty +tych +tylko +tym +u +w +wam +wami +was +wasz +wasza +wasze +we +według +wiele +wielu +więc +więcej +wszyscy +wszystkich +wszystkie +wszystkim +wszystko +wtedy +wy +właśnie +z +za +zapewne +zawsze +zeznowu +znów +został +żaden +żadna +żadne +żadnych +że +żeby \ No newline at end of file diff --git a/src/tfidf/stop-words/stop-words-portugese.txt b/src/tfidf/stop-words/stop-words-portugese.txt new file mode 100644 index 0000000..cc0b170 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-portugese.txt @@ -0,0 +1,147 @@ +acerca +agora +algmas +alguns +ali +ambos +antes +apontar +aquela +aquelas +aquele +aqueles +aqui +atrás +bem +bom +cada +caminho +cima +com +como +comprido +conhecido +corrente +das +debaixo +dentro +desde +desligado +deve +devem +deverá +direita +diz +dizer +dois +dos +e +é +ela +ele +eles +em +enquanto +então +está +estado +estão +estar +estará +este +estes +esteve +estive +estivemos +estiveram +eu +fará +faz +fazer +fazia +fez +fim +foi +fora +horas +iniciar +inicio +ir +irá +ista +iste +isto +ligado +maioria +maiorias +mais +mas +mesmo +meu +muito +muitos +não +nome +nós +nosso +novo +o +onde +os +ou +outro +para +parte +pegar +pelo +pessoas +pode +poderá +podia +por +porque +povo +promeiro +qual +qualquer +quando +quê +quem +quieto +saber +são +sem +ser +seu +somente +tal +também +tem +têm +tempo +tenho 
+tentar +tentaram +tente +tentei +teu +teve +tipo +tive +todos +trabalhar +trabalho +tu +último +um +uma +umas +uns +usa +usar +valor +veja +ver +verdade +verdadeiro +você diff --git a/src/tfidf/stop-words/stop-words-russian.txt b/src/tfidf/stop-words/stop-words-russian.txt new file mode 100644 index 0000000..9498480 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-russian.txt @@ -0,0 +1,421 @@ +а +е +и +ж +м +о +на +не +ни +об +но +он +мне +мои +мож +она +они +оно +мной +много +многочисленное +многочисленная +многочисленные +многочисленный +мною +мой +мог +могут +можно +может +можхо +мор +моя +моё +мочь +над +нее +оба +нам +нем +нами +ними +мимо +немного +одной +одного +менее +однажды +однако +меня +нему +меньше +ней +наверху +него +ниже +мало +надо +один +одиннадцать +одиннадцатый +назад +наиболее +недавно +миллионов +недалеко +между +низко +меля +нельзя +нибудь +непрерывно +наконец +никогда +никуда +нас +наш +нет +нею +неё +них +мира +наша +наше +наши +ничего +начала +нередко +несколько +обычно +опять +около +мы +ну +нх +от +отовсюду +особенно +нужно +очень +отсюда +в +во +вон +вниз +внизу +вокруг +вот +восемнадцать +восемнадцатый +восемь +восьмой +вверх +вам +вами +важное +важная +важные +важный +вдали +везде +ведь +вас +ваш +ваша +ваше +ваши +впрочем +весь +вдруг +вы +все +второй +всем +всеми +времени +время +всему +всего +всегда +всех +всею +всю +вся +всё +всюду +г +год +говорил +говорит +года +году +где +да +ее +за +из +ли +же +им +до +по +ими +под +иногда +довольно +именно +долго +позже +более +должно +пожалуйста +значит +иметь +больше +пока +ему +имя +пор +пора +потом +потому +после +почему +почти +посреди +ей +два +две +двенадцать +двенадцатый +двадцать +двадцатый +двух +его +дел +или +без +день +занят +занята +занято +заняты +действительно +давно +девятнадцать +девятнадцатый +девять +девятый +даже +алло +жизнь +далеко +близко +здесь +дальше +для +лет +зато +даром +первый +перед +затем +зачем +лишь +десять +десятый +ею +её +их +бы +еще +при +был +про 
+процентов +против +просто +бывает +бывь +если +люди +была +были +было +будем +будет +будете +будешь +прекрасно +буду +будь +будто +будут +ещё +пятнадцать +пятнадцатый +друго +другое +другой +другие +другая +других +есть +пять +быть +лучше +пятый +к +ком +конечно +кому +кого +когда +которой +которого +которая +которые +который +которых +кем +каждое +каждая +каждые +каждый +кажется +как +какой +какая +кто +кроме +куда +кругом +с +т +у +я +та +те +уж +со +то +том +снова +тому +совсем +того +тогда +тоже +собой +тобой +собою +тобою +сначала +только +уметь +тот +тою +хорошо +хотеть +хочешь +хоть +хотя +свое +свои +твой +своей +своего +своих +свою +твоя +твоё +раз +уже +сам +там +тем +чем +сама +сами +теми +само +рано +самом +самому +самой +самого +семнадцать +семнадцатый +самим +самими +самих +саму +семь +чему +раньше +сейчас +чего +сегодня +себе +тебе +сеаой +человек +разве +теперь +себя +тебя +седьмой +спасибо +слишком +так +такое +такой +такие +также +такая +сих +тех +чаще +четвертый +через +часто +шестой +шестнадцать +шестнадцатый +шесть +четыре +четырнадцать +четырнадцатый +сколько +сказал +сказала +сказать +ту +ты +три +эта +эти +что +это +чтоб +этом +этому +этой +этого +чтобы +этот +стал +туда +этим +этими +рядом +тринадцать +тринадцатый +этих +третий +тут +эту +суть +чуть +тысяч diff --git a/src/tfidf/stop-words/stop-words-slovak.txt b/src/tfidf/stop-words/stop-words-slovak.txt new file mode 100644 index 0000000..07da7b3 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-slovak.txt @@ -0,0 +1,173 @@ +a +aby +aj +ak +ako +ale +alebo +and +ani +áno +asi +až +bez +bude +budem +budeš +budeme +budete +budú +by +bol +bola +boli +bolo +byť +cez +čo +či +ďalší +ďalšia +ďalšie +dnes +do +ho +ešte +for +i +ja +je +jeho +jej +ich +iba +iné +iný +som +si +sme +sú +k +kam +každý +každá +každé +každí +kde +keď +kto +ktorá +ktoré +ktorou +ktorý +ktorí +ku +lebo +len +ma +mať +má +máte +medzi +mi +mna +mne +mnou +musieť +môcť +môj +môže +my +na +nad +nám +náš +naši +nie +nech 
+než +nič +niektorý +nové +nový +nová +nové +noví +o +od +odo +of +on +ona +ono +oni +ony +po +pod +podľa +pokiaľ +potom +práve +pre +prečo +preto +pretože +prvý +prvá +prvé +prví +pred +predo +pri +pýta +s +sa +so +si +svoje +svoj +svojich +svojím +svojími +ta +tak +takže +táto +teda +te +tě +ten +tento +the +tieto +tým +týmto +tiež +to +toto +toho +tohoto +tom +tomto +tomuto +toto +tu +tú +túto +tvoj +ty +tvojími +už +v +vám +váš +vaše +vo +viac +však +všetok +vy +z +za +zo +že diff --git a/src/tfidf/stop-words/stop-words-slovak2.txt b/src/tfidf/stop-words/stop-words-slovak2.txt new file mode 100644 index 0000000..d21096a --- /dev/null +++ b/src/tfidf/stop-words/stop-words-slovak2.txt @@ -0,0 +1,106 @@ +a +aby +aj +ako +ale +alebo +ani +áno +asi +až +bez +buď +by +cez +či +čo +ešte +ho +i +iba +ich +ja +je +jeho +jej +ju +k +kam +kde +keď +kto +ku +menej +mi +moja +moje +môj +my +nad +nám +než +nič +nie +o +od +on +on +ona +ona +oni +ono +po +pod +podľa +pokiaľ +potom +práve +prečo +pred +preto +pretože +pri +s +sa +si +sme +so +som +späť +ste +sú +sú +ta +tá +tak +tak +takže +tam +tam +táto +teda +ten +tento +tieto +tiež +to +to +toho +tom +tomto +toto +tu +túto +ty +tým +týmto +už +v +vám +viac +vo +však +vy +z +za +zo diff --git a/src/tfidf/stop-words/stop-words-spanish.txt b/src/tfidf/stop-words/stop-words-spanish.txt new file mode 100644 index 0000000..8095338 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-spanish.txt @@ -0,0 +1,181 @@ +algún +alguna +algunas +alguno +algunos +ambos +ampleamos +ante +antes +aquel +aquellas +aquellos +aqui +arriba +atras +bajo +bastante +bien +cada +cierta +ciertas +cierto +ciertos +como +con +conseguimos +conseguir +consigo +consigue +consiguen +consigues +cual +cuando +de +dentro +desde +donde +dos +el +ellas +ellos +empleais +emplean +emplear +empleas +empleo +en +encima +entonces +entre +era +eramos +eran +eras +eres +es +esta +estaba +estado +estais +estamos +estan +estoy +fin +fue +fueron +fui +fuimos +gueno +ha 
+hace +haceis +hacemos +hacen +hacer +haces +hago +incluso +intenta +intentais +intentamos +intentan +intentar +intentas +intento +ir +la +largo +las +lo +los +mientras +mio +modo +muchos +muy +nos +nosotros +otro +para +pero +podeis +podemos +poder +podria +podriais +podriamos +podrian +podrias +por +por +porque +primero +puede +pueden +puedo +que +qué +quien +sabe +sabeis +sabemos +saben +saber +sabes +ser +si +siendo +sin +sobre +sois +solamente +solo +somos +soy +su +sus +también +teneis +tenemos +tener +tengo +tiempo +tiene +tienen +todo +trabaja +trabajais +trabajamos +trabajan +trabajar +trabajas +trabajo +tras +tuyo +ultimo +un +una +unas +uno +unos +usa +usais +usamos +usan +usar +usas +uso +va +vais +valor +vamos +van +vaya +verdad +verdadera +verdadero +vosotras +vosotros +voy +yo diff --git a/src/tfidf/stop-words/stop-words-swedish.txt b/src/tfidf/stop-words/stop-words-swedish.txt new file mode 100644 index 0000000..05c3e14 --- /dev/null +++ b/src/tfidf/stop-words/stop-words-swedish.txt @@ -0,0 +1,386 @@ +aderton +adertonde +adjö +aldrig +alla +allas +allt +alltid +alltså +än +andra +andras +annan +annat +ännu +artonde +artonn +åtminstone +att +åtta +åttio +åttionde +åttonde +av +även +båda +bådas +bakom +bara +bäst +bättre +behöva +behövas +behövde +behövt +beslut +beslutat +beslutit +bland +blev +bli +blir +blivit +bort +borta +bra +då +dag +dagar +dagarna +dagen +där +därför +de +del +delen +dem +den +deras +dess +det +detta +dig +din +dina +dit +ditt +dock +du +efter +eftersom +elfte +eller +elva +en +enkel +enkelt +enkla +enligt +er +era +ert +ett +ettusen +få +fanns +får +fått +fem +femte +femtio +femtionde +femton +femtonde +fick +fin +finnas +finns +fjärde +fjorton +fjortonde +fler +flera +flesta +följande +för +före +förlåt +förra +första +fram +framför +från +fyra +fyrtio +fyrtionde +gå +gälla +gäller +gällt +går +gärna +gått +genast +genom +gick +gjorde +gjort +god +goda +godare +godast +gör +göra +gott +ha +hade +haft +han +hans +har +här 
+heller +hellre +helst +helt +henne +hennes +hit +hög +höger +högre +högst +hon +honom +hundra +hundraen +hundraett +hur +i +ibland +idag +igår +igen +imorgon +in +inför +inga +ingen +ingenting +inget +innan +inne +inom +inte +inuti +ja +jag +jämfört +kan +kanske +knappast +kom +komma +kommer +kommit +kr +kunde +kunna +kunnat +kvar +länge +längre +långsam +långsammare +långsammast +långsamt +längst +långt +lätt +lättare +lättast +legat +ligga +ligger +lika +likställd +likställda +lilla +lite +liten +litet +man +många +måste +med +mellan +men +mer +mera +mest +mig +min +mina +mindre +minst +mitt +mittemot +möjlig +möjligen +möjligt +möjligtvis +mot +mycket +någon +någonting +något +några +när +nästa +ned +nederst +nedersta +nedre +nej +ner +ni +nio +nionde +nittio +nittionde +nitton +nittonde +nödvändig +nödvändiga +nödvändigt +nödvändigtvis +nog +noll +nr +nu +nummer +och +också +ofta +oftast +olika +olikt +om +oss +över +övermorgon +överst +övre +på +rakt +rätt +redan +så +sade +säga +säger +sagt +samma +sämre +sämst +sedan +senare +senast +sent +sex +sextio +sextionde +sexton +sextonde +sig +sin +sina +sist +sista +siste +sitt +sjätte +sju +sjunde +sjuttio +sjuttionde +sjutton +sjuttonde +ska +skall +skulle +slutligen +små +smått +snart +som +stor +stora +större +störst +stort +tack +tidig +tidigare +tidigast +tidigt +till +tills +tillsammans +tio +tionde +tjugo +tjugoen +tjugoett +tjugonde +tjugotre +tjugotvå +tjungo +tolfte +tolv +tre +tredje +trettio +trettionde +tretton +trettonde +två +tvåhundra +under +upp +ur +ursäkt +ut +utan +utanför +ute +vad +vänster +vänstra +var +vår +vara +våra +varför +varifrån +varit +varken +värre +varsågod +vart +vårt +vem +vems +verkligen +vi +vid +vidare +viktig +viktigare +viktigast +viktigt +vilka +vilken +vilket +vill diff --git a/src/tfidf/stop-words/stop-words-turkish.txt b/src/tfidf/stop-words/stop-words-turkish.txt new file mode 100644 index 0000000..03cdd1d --- /dev/null +++ 
// maxSearchResults caps how many documents SearchDocumentsByWord returns.
const maxSearchResults = 5

// The patterns are compiled once at package scope instead of on every call.
var (
	// htmlTagRe matches one HTML tag; (?s) lets a tag span a line break.
	htmlTagRe = regexp.MustCompile(`(?s)<.*?>`)
	// punctuationRe matches the punctuation that separates words.
	punctuationRe = regexp.MustCompile(`[.?,;:!¡¿]`)
	// plainWordRe accepts tokens made only of ASCII letters.
	plainWordRe = regexp.MustCompile(`^[A-Za-z]+$`)
)

// TfIdfMatrix holds the term statistics of a document collection.
//
// Stopwords lists the words TfCalc ignores. DocFreqs holds one term-frequency
// map per document, in insertion order. TotalFreq holds, per word, the number
// of documents containing it until CalculateIdf runs; afterwards it holds the
// inverse document frequency of the word.
type TfIdfMatrix struct {
	Stopwords []string
	TotalFreq map[string]float64
	DocFreqs  []map[string]float64
}

// scoredEntry pairs a word or a document index with its TF-IDF score.
type scoredEntry struct {
	label string
	index int
	score float64
}

// byScoreDesc orders entries by descending score; ties are broken by
// ascending index and then by label so the order is fully deterministic.
type byScoreDesc []scoredEntry

func (s byScoreDesc) Len() int      { return len(s) }
func (s byScoreDesc) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s byScoreDesc) Less(i, j int) bool {
	if s[i].score != s[j].score {
		return s[i].score > s[j].score
	}
	if s[i].index != s[j].index {
		return s[i].index < s[j].index
	}
	return s[i].label < s[j].label
}

// LoadStopWords appends the whitespace-separated words found in filename to
// matrix.Stopwords. A read error is printed and leaves the list untouched.
func (matrix *TfIdfMatrix) LoadStopWords(filename string) {
	contents, err := ioutil.ReadFile(filename)
	if err != nil {
		fmt.Println(err)
		return
	}

	// TfCalc lower-cases every token before the stop-word check, so an entry
	// stored with upper-case letters could never match.
	elements := strings.Fields(strings.ToLower(string(contents)))

	fmt.Printf("Loaded %d stopwords.\n", len(elements))
	matrix.Stopwords = append(matrix.Stopwords, elements...)
}

// checkStopWords reports whether word is one of the loaded stop words.
func (matrix *TfIdfMatrix) checkStopWords(word string) bool {
	for _, stopword := range matrix.Stopwords {
		if word == stopword {
			return true
		}
	}
	return false
}

// ClearHtml returns text with every HTML tag removed.
func ClearHtml(text string) string {
	return htmlTagRe.ReplaceAllString(text, "")
}

// TfCalc returns the term frequencies of text: how many times each word
// occurs. Words are lower-cased; stop words and tokens containing anything
// other than ASCII letters are skipped.
func (matrix *TfIdfMatrix) TfCalc(text string) map[string]float64 {
	tfMatrix := make(map[string]float64)

	// Punctuation becomes a space rather than being deleted, so that
	// "cat,dog" yields two tokens instead of the fused token "catdog".
	cleaned := strings.ToLower(punctuationRe.ReplaceAllString(text, " "))

	for _, word := range strings.Fields(cleaned) {
		// A word that is already counted has passed both filters, so the
		// linear stop-word scan is needed only once per distinct word.
		if _, seen := tfMatrix[word]; seen {
			tfMatrix[word]++
			continue
		}
		if plainWordRe.MatchString(word) && !matrix.checkStopWords(word) {
			tfMatrix[word] = 1.0
		}
	}

	return tfMatrix
}

// DfUpdate records one document in the document-frequency table: every word
// with a positive count in docFreqs adds 1 to matrix.TotalFreq, however many
// times it occurs in that document.
func (matrix *TfIdfMatrix) DfUpdate(docFreqs map[string]float64) {
	if matrix.TotalFreq == nil {
		matrix.TotalFreq = make(map[string]float64)
	}

	for word, quantity := range docFreqs {
		if quantity > 0 {
			matrix.TotalFreq[word]++
		}
	}
}

// CalculateIdf converts the document counts stored in matrix.TotalFreq into
// inverse document frequencies, log(totalDocNo / count). It does nothing when
// totalDocNo is not positive; a non-positive count is stored as 0.
func (matrix *TfIdfMatrix) CalculateIdf(totalDocNo int) {
	if totalDocNo <= 0 {
		return
	}

	total := float64(totalDocNo)
	for word, docCount := range matrix.TotalFreq {
		if docCount <= 0 {
			// Avoid +Inf/NaN weights for words no document contains.
			matrix.TotalFreq[word] = 0
			continue
		}
		matrix.TotalFreq[word] = math.Log(total / docCount)
	}
}

// PrintTfIdf prints a bar chart of the TF-IDF scores of one document, given
// its term frequencies. Scores are scaled so the highest one gets 50 '#'
// characters; words scoring below one '#' are omitted. Words are printed from
// the highest score to the lowest. A word missing from matrix.TotalFreq is
// scored by its term frequency alone.
func (matrix *TfIdfMatrix) PrintTfIdf(documentFreqs map[string]float64) {
	maxTfIdf := 0.0
	entries := make(byScoreDesc, 0, len(documentFreqs))

	for word, amount := range documentFreqs {
		score := amount
		if idf, ok := matrix.TotalFreq[word]; ok {
			score = amount * idf
		}
		if score > maxTfIdf {
			maxTfIdf = score
		}
		entries = append(entries, scoredEntry{label: word, score: score})
	}

	// With no positive score the normalization would divide by zero and
	// strings.Repeat could receive a negative count.
	if maxTfIdf <= 0 {
		return
	}

	sort.Sort(entries)

	for _, entry := range entries {
		normalized := (entry.score / maxTfIdf) * 50.0 // Normalize and extend to 50 range
		if normalized < 1.0 {
			continue
		}
		fmt.Printf("%s:\n", entry.label)
		fmt.Println(strings.Repeat("#", int(normalized)))
	}
}

// SearchDocumentsByWord returns the documents that best match text. The first
// result holds indexes into matrix.DocFreqs and the second the matching
// TF-IDF scores, both ordered from best to worst and limited to the five best
// documents. Documents whose score is not positive are left out.
func (matrix *TfIdfMatrix) SearchDocumentsByWord(text string) ([]int, []float64) {
	idf := matrix.TotalFreq[text]

	var hits byScoreDesc
	for docIndex, document := range matrix.DocFreqs {
		tf, ok := document[text]
		if !ok {
			continue
		}
		if score := idf * tf; score > 0 {
			hits = append(hits, scoredEntry{index: docIndex, score: score})
		}
	}

	sort.Sort(hits)
	if len(hits) > maxSearchResults {
		hits = hits[:maxSearchResults]
	}

	// Always return non-nil slices, as callers received before.
	docs := make([]int, 0, len(hits))
	scores := make([]float64, 0, len(hits))
	for _, hit := range hits {
		docs = append(docs, hit.index)
		scores = append(scores, hit.score)
	}

	return docs, scores
}

// TfIdf adds texts to the collection and recomputes the inverse document
// frequencies over every document stored so far, so it can be called again
// to add more documents.
func (matrix *TfIdfMatrix) TfIdf(texts []string) {
	for _, text := range texts {
		matrix.DocFreqs = append(matrix.DocFreqs, matrix.TfCalc(text))
	}

	// Rebuild the table from scratch: after an earlier call TotalFreq holds
	// IDF values, which must not be mixed with fresh document counts.
	matrix.TotalFreq = make(map[string]float64)
	for _, tf := range matrix.DocFreqs {
		matrix.DfUpdate(tf)
	}

	matrix.CalculateIdf(len(matrix.DocFreqs))
}