tfidf/main.go

208 lines
4.6 KiB
Go

package main
import (
	"bufio"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strings"

	"tfidf"
)
// hrefPattern matches an href attribute whose value is wrapped in a matching
// pair of double or single quotes. The value is captured in group 1 (double
// quotes) or group 2 (single quotes). Values never span a line break.
var hrefPattern = regexp.MustCompile(`href=(?:"([^"\n]*)"|'([^'\n]*)')`)

// nonDocumentMarkers lists the substrings that mark a link as a static asset
// rather than a page worth indexing.
var nonDocumentMarkers = []string{".png", ".jpg", ".gif", ".xml", ".css", ".js"}

// getLinks extracts the href targets found in text and returns those that
// contain mainUrl and do not look like static assets.
//
// The returned slice is never nil; it is empty when nothing qualifies.
// Links are returned in the order they appear in text, duplicates included.
func getLinks(text, mainUrl string) []string {
	result := make([]string, 0)
	for _, match := range hrefPattern.FindAllStringSubmatch(text, -1) {
		// Exactly one of the two groups participates in a match; the other
		// is reported as the empty string.
		link := match[1]
		if link == "" {
			link = match[2]
		}
		if !strings.Contains(link, mainUrl) || isNonDocumentLink(link) {
			continue
		}
		result = append(result, link)
	}
	return result
}

// isNonDocumentLink reports whether link contains any of the asset markers.
// The check is a plain substring test, so a marker anywhere in the link
// (not only as a file extension) excludes it.
func isNonDocumentLink(link string) bool {
	for _, marker := range nonDocumentMarkers {
		if strings.Contains(link, marker) {
			return true
		}
	}
	return false
}
// fetchAndGetLinks downloads url, extracts its links with getLinks (using url
// itself as the filter) and sends them on output.
//
// Exactly one value is sent per call. If the request or reading the body
// fails, an empty slice is sent instead.
func fetchAndGetLinks(url string, output chan []string) {
	links := []string{}
	if resp, err := http.Get(url); err == nil {
		defer resp.Body.Close()
		if page, readErr := ioutil.ReadAll(resp.Body); readErr == nil {
			links = getLinks(string(page), url)
		}
	}
	output <- links
}
// fetchText downloads url and sends a two-element slice {url, text} on
// output, where text is the page body passed through tfidf.ClearHtml.
//
// Exactly one value is sent per call. nil is sent when the request fails,
// when the body cannot be read, or when the body does not contain both
// "<html" and "<body".
func fetchText(url string, output chan []string) {
	var document []string
	if resp, err := http.Get(url); err == nil {
		defer resp.Body.Close()
		if raw, readErr := ioutil.ReadAll(resp.Body); readErr == nil {
			page := string(raw)
			// Only bodies that look like a full HTML page are indexed.
			if strings.Contains(page, "<html") && strings.Contains(page, "<body") {
				document = []string{url, tfidf.ClearHtml(page)}
			}
		}
	}
	output <- document
}
// getDocuments crawls one level deep from the given index pages and returns
// the fetched documents.
//
// The first result holds the document texts and the second the URL each text
// came from; the two slices are parallel. Pages for which fetchText reports
// nil are left out of both. Every distinct URL is fetched only once, so a
// page that is linked many times yields a single document.
func getDocuments(websites []string) ([]string, []string) {
	webChannel := make(chan []string, 10)
	textChannel := make(chan []string, 10)

	seen := make(map[string]bool, len(websites))
	allDocsUris := make([]string, 0, len(websites))
	// addUnique appends the URLs not queued before, keeping first-seen order.
	addUnique := func(urls []string) {
		for _, u := range urls {
			if !seen[u] {
				seen[u] = true
				allDocsUris = append(allDocsUris, u)
			}
		}
	}

	// The index pages are documents themselves.
	addUnique(websites)
	indexPages := len(allDocsUris)

	fmt.Println("Initial link parsing...")
	for _, url := range allDocsUris[:indexPages] {
		fmt.Printf("Fetching: %s\n", url)
		go fetchAndGetLinks(url, webChannel)
	}
	// One reply is expected per goroutine started above.
	for received := 0; received < indexPages; received++ {
		links := <-webChannel
		fmt.Printf("Received %d links...\n", len(links))
		addUnique(links)
	}

	fmt.Printf("Now fetching texts from %d urls\n", len(allDocsUris))
	for count, url := range allDocsUris {
		fmt.Printf("%d of %d - Fetching: %s\n", count+1, len(allDocsUris), url)
		go fetchText(url, textChannel)
	}

	results := make([]string, 0, len(allDocsUris))
	sites := make([]string, 0, len(allDocsUris))
	for count := 0; count < len(allDocsUris); count++ {
		text := <-textChannel
		fmt.Printf("Received %d web pages.\n", count+1)
		if text != nil {
			sites = append(sites, text[0])
			results = append(results, text[1])
		}
	}
	return results, sites
}
// loadWebsites reads filename and returns its whitespace-separated entries.
//
// If the file cannot be read, the error is printed and nil is returned.
// On success the number of entries is reported on standard output.
func loadWebsites(filename string) []string {
	content, err := ioutil.ReadFile(filename)
	if err != nil {
		fmt.Println(err)
		return nil
	}
	elements := strings.Fields(string(content))
	// Terminate the line so the next message does not run into this one.
	fmt.Printf("Loaded %d websites\n", len(elements))
	return elements
}
// getCommandLineArgument returns the value given for option on the command
// line, or defaultValue when the option is absent or has no value.
//
// Two spellings are accepted: the value attached to the option ("-wfile.txt")
// or the value as the following argument ("-w file.txt"). Only the first
// argument that starts with option is considered. The value is returned with
// surrounding whitespace trimmed and is otherwise left untouched.
func getCommandLineArgument(option, defaultValue string) string {
	for i := 1; i < len(os.Args); i++ {
		arg := os.Args[i]
		// Match on the prefix only, so that values such as "foo-site.txt"
		// are not mistaken for the "-s" option.
		if !strings.HasPrefix(arg, option) {
			continue
		}
		if attached := strings.TrimSpace(strings.TrimPrefix(arg, option)); attached != "" {
			return attached
		}
		// The option stands alone: its value is the next argument, if any.
		if i+1 < len(os.Args) {
			return strings.TrimSpace(os.Args[i+1])
		}
		return defaultValue
	}
	return defaultValue
}
// getWebsitesArgument returns the value of the "-w" command-line option,
// falling back to "websites.txt" when getCommandLineArgument finds none.
func getWebsitesArgument() string {
	const (
		option   = "-w"
		fallback = "websites.txt"
	)
	return getCommandLineArgument(option, fallback)
}
// getStopWordsArgument returns the value of the "-s" command-line option,
// falling back to "english1" when getCommandLineArgument finds none.
func getStopWordsArgument() string {
	const (
		option   = "-s"
		fallback = "english1"
	)
	return getCommandLineArgument(option, fallback)
}
// main loads the stop words and the website list, fetches the documents,
// builds the TF-IDF matrix and then answers search terms read from standard
// input.
//
// The prompt loop ends when the user enters "q" or when standard input is
// exhausted or fails. Blank lines are ignored. Each whitespace-separated
// word on a line is searched in turn.
func main() {
	var matrix tfidf.TfIdfMatrix

	fmt.Println("Init the TfIdf with stopwords")
	stopwordsLanguage := getStopWordsArgument()
	matrix.LoadStopWords(fmt.Sprintf("./src/tfidf/stop-words/stop-words-%s.txt", stopwordsLanguage))

	websites := loadWebsites(getWebsitesArgument())
	input, sites := getDocuments(websites)
	matrix.TfIdf(input)

	stdin := bufio.NewScanner(os.Stdin)
	for {
		fmt.Println("Introduzca término de búsqueda o q para salir:")
		// Scan reports false on EOF or a read error; stop rather than
		// prompting forever with a stale search term.
		if !stdin.Scan() {
			return
		}
		for _, searchTerm := range strings.Fields(stdin.Text()) {
			if searchTerm == "q" {
				return
			}
			results, percentages := matrix.SearchDocumentsByWord(searchTerm)
			fmt.Println("Resultados:")
			for c, value := range results {
				// NOTE(review): the score is scaled by 10 before being shown
				// as a percentage, as in the original; confirm against the
				// range returned by SearchDocumentsByWord.
				fmt.Printf("%d: %.2f%% - %s\n", c, percentages[c]*10.0, sites[value])
			}
		}
	}
}