// Package main is an interactive command-line search tool built on the
// project-local tfidf package.
//
// main loads a stop-word file into a tfidf.TfIdfMatrix, obtains a list of
// websites through loadWebsites and their documents through getDocuments,
// passes the documents to matrix.TfIdf, and then repeatedly prompts for a
// search term. For every term it calls matrix.SearchDocumentsByWord and prints
// one line per result: the result index, percentages[c] * 10.0 and the entry
// of sites selected by the result value. Entering "q" ends the loop.
//
// Command-line options, both read through getCommandLineArgument:
//
//	-w  websites file; the call site passes "websites.txt" as the second
//	    argument (presumably the default value; the function header is not
//	    visible in this file, so confirm).
//	-s  stop-word language; the call site passes "english1". main inserts the
//	    value into the path ./src/tfidf/stop-words/stop-words-%s.txt.
//
// The visible part of getCommandLineArgument scans os.Args from index 1,
// picks the first argument that contains the option text, removes the option
// text and surrounding whitespace from it, and, if nothing is left, applies
// the same clean-up to the following argument instead.
//
// Helpers visible in this file:
//
//   - getLinks(text, mainUrl) finds href attributes (double- or single-quoted)
//     in text with a regular expression, strips the href= prefix and the
//     quote characters, and returns the values that contain mainUrl and do
//     not contain any of .png, .jpg, .gif, .xml, .css or .js.
//   - fetchAndGetLinks(url, output) performs an HTTP GET on url, reads the
//     whole body and sends getLinks(body, url) on output. If the request or
//     the read fails it sends an empty slice instead.
//   - fetchText(url, output) performs an HTTP GET on url and sends nil on
//     output when the request or the read fails. The remainder of its body is
//     missing from this file (see the first note below).
//
// NOTE(review): the stored source text is damaged.
//
//   - All line breaks have been lost: the program occupies two physical lines
//     and the package clause shares a line with the import declaration, so it
//     does not compile as stored. Line breaks have to be restored before
//     gofmt can process it.
//   - Text is missing between the strings.Contains(string(body), ...) call in
//     fetchText and the "for count := 1" loop that follows it. The rest of
//     fetchText, the definitions of loadWebsites and getDocuments, and the
//     opening of what appears to be getCommandLineArgument are not present.
//     Presumably a span starting at a '<' character was stripped by some
//     tool; TODO restore from version control.
//
// NOTE(review): points to confirm once the file is restored.
//
//   - The error returned by regexp.Compile in getLinks is discarded.
//   - The extension filters in getLinks are substring tests, so ".js" also
//     rejects links containing ".json" or ".jsp", and a match anywhere in the
//     URL (not only at the end) rejects the link.
//   - getCommandLineArgument reads os.Args[count + 1] without checking that
//     such an argument exists.
//   - main multiplies percentages[c] by 10.0 before printing it with a
//     percent sign; confirm the scale returned by SearchDocumentsByWord.
package main import ( "fmt" "io/ioutil" "net/http" "regexp" "strings" "os" "tfidf" ) func getLinks(text, mainUrl string) []string { result := make([]string, 0) hrefLocator, _ := regexp.Compile("href=(\"|').*?(\"|')") elems := hrefLocator.FindAllString(text, -1) for _, link := range elems { temp := strings.Replace(link, "href=", "", -1) temp = strings.Replace(temp, "\"", "", -1) temp = strings.Replace(temp, "'", "", -1) switch { case strings.Contains(temp, ".png"): continue case strings.Contains(temp, ".jpg"): continue case strings.Contains(temp, ".gif"): continue case strings.Contains(temp, ".xml"): continue case strings.Contains(temp, ".css"): continue case strings.Contains(temp, ".js"): continue case !strings.Contains(temp, mainUrl): continue default: result = append(result, temp) } } return result } func fetchAndGetLinks(url string, output chan []string) { result := make([]string, 0) resp, err := http.Get(url) if err != nil { output <- result return } body, err := ioutil.ReadAll(resp.Body) defer resp.Body.Close() if err != nil { output <- result return } links := getLinks(string(body), url) output <- links } func fetchText(url string, output chan []string) { resp, err := http.Get(url) if err != nil { output <- nil return } defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { output <- nil return } if strings.Contains(string(body), " 1 { for count := 1; count < len(os.Args); count++ { if strings.Contains(os.Args[count], option) { result = os.Args[count] result = strings.Replace(result, option, "", -1) result = strings.TrimSpace(result) if result == "" { result = os.Args[count + 1] result = strings.Replace(result, option, "", -1) result = strings.TrimSpace(result) } break } } } return result } func getWebsitesArgument() string { return getCommandLineArgument("-w", "websites.txt") } func getStopWordsArgument() string { return getCommandLineArgument("-s", "english1") } func main() { var searchTerm string var matrix tfidf.TfIdfMatrix 
fmt.Println("Init the TfIdf with stopwords") stopwordsLanguage := getStopWordsArgument() matrix.LoadStopWords(fmt.Sprintf("./src/tfidf/stop-words/stop-words-%s.txt", stopwordsLanguage)) websites := loadWebsites(getWebsitesArgument()) input, sites := getDocuments(websites) matrix.TfIdf(input) running := true for running { fmt.Println("Introduzca término de búsqueda o q para salir:") fmt.Scanf("%s", &searchTerm) if searchTerm != "q" { results, percentages := matrix.SearchDocumentsByWord(searchTerm) fmt.Println("Resultados:") for c, value := range results { fmt.Printf("%d: %.2f%% - %s\n", c, percentages[c] * 10.0, sites[value]) } } else { running = false } } }