-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathKeywordExtractor.go
More file actions
176 lines (148 loc) · 5.03 KB
/
KeywordExtractor.go
File metadata and controls
176 lines (148 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package main
import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"

	"golang.org/x/net/html"
)
// WordCount pairs a single word with the number of times it was counted.
type WordCount struct {
	// Word is the counted word.
	Word string
	// Count is the number of occurrences recorded for Word.
	Count int
}
// stopWords is the set of common English function words (articles, pronouns,
// prepositions, auxiliaries and the like) that are filtered out before words
// are counted. Every listed word maps to true; looking up any other string
// yields false. All entries are lowercase.
var stopWords = func() map[string]bool {
	// Grouped by initial letter to make the list easy to scan and extend.
	words := []string{
		"a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
		"be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
		"can",
		"did", "do", "does", "doing", "don", "down", "during",
		"each",
		"few", "for", "from", "further",
		"had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
		"i", "if", "in", "into", "is", "it", "its", "itself",
		"just",
		"me", "more", "most", "my", "myself",
		"no", "nor", "not", "now",
		"of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own",
		"same", "she", "should", "so", "some", "such",
		"than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this",
		"those", "through", "to", "too",
		"under", "until", "up",
		"very",
		"was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with",
		"you", "your", "yours", "yourself", "yourselves",
	}

	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()
// main is the command-line entry point.
//
// Usage: program <url> <number_of_keywords>
//
// It downloads the page at <url>, extracts its text, and prints up to
// <number_of_keywords> of the most frequent words together with their counts.
// Usage errors and processing failures are written to standard error and end
// the process with exit status 1; the keyword report goes to standard output.
func main() {
	if len(os.Args) != 3 {
		fmt.Fprintln(os.Stderr, "Usage: program <url> <number_of_keywords>")
		os.Exit(1)
	}
	url := os.Args[1]

	// Atoi rejects partially numeric input such as "10abc", which a "%d"
	// scan would accept as 10, and reports non-numeric input explicitly
	// instead of leaving the count at zero.
	numKeywords, err := strconv.Atoi(os.Args[2])
	if err != nil {
		fmt.Fprintf(os.Stderr, "Invalid number of keywords %q: expected an integer\n", os.Args[2])
		os.Exit(1)
	}
	if numKeywords <= 0 {
		fmt.Fprintln(os.Stderr, "Number of keywords must be greater than 0")
		os.Exit(1)
	}

	// Fetch the webpage content.
	content, err := fetchURL(url)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error fetching URL: %v\n", err)
		os.Exit(1)
	}

	// Extract text from HTML.
	text, err := extractTextFromHTML(content)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error extracting text: %v\n", err)
		os.Exit(1)
	}

	// Extract keywords.
	keywords := extractKeywords(text, numKeywords)

	// Print results; fewer than numKeywords lines appear when the page does
	// not contain that many distinct keywords.
	fmt.Printf("Top %d keywords from %s:\n", numKeywords, url)
	fmt.Println("----------------------------------")
	for i, kw := range keywords {
		fmt.Printf("%d. %s (%d occurrences)\n", i+1, kw.Word, kw.Count)
	}
}
// fetchTimeout bounds one complete fetch: connecting, following redirects,
// and reading the response body.
const fetchTimeout = 30 * time.Second

// fetchClient is the HTTP client shared by all fetchURL calls. Unlike
// http.DefaultClient it carries a timeout, so an unresponsive server cannot
// block the program indefinitely.
var fetchClient = &http.Client{Timeout: fetchTimeout}

// fetchURL downloads url with an HTTP GET and returns the response body as a
// string.
//
// It returns an error when the request cannot be made or times out, when the
// server answers with any status other than 200 OK, or when the body cannot
// be read completely. On error the returned string is empty.
func fetchURL(url string) (string, error) {
	resp, err := fetchClient.Get(url)
	if err != nil {
		// Client errors already name the operation and the URL.
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP request failed with status code: %d", resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading response body: %w", err)
	}
	return string(body), nil
}
// extractTextFromHTML parses htmlContent and returns the text of its text
// nodes, in document order, joined by single spaces.
//
// Each text node is trimmed of surrounding whitespace and dropped when
// nothing remains. Everything inside <script> and <style> elements is left
// out. A parse failure is returned as-is together with an empty string.
func extractTextFromHTML(htmlContent string) (string, error) {
	root, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return "", err
	}

	var pieces []string

	// walk visits node and its descendants depth-first, collecting text.
	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		switch node.Type {
		case html.TextNode:
			if trimmed := strings.TrimSpace(node.Data); trimmed != "" {
				pieces = append(pieces, trimmed)
			}
		case html.ElementNode:
			// Script and style bodies are code, not page text: skip the
			// whole subtree.
			if node.Data == "script" || node.Data == "style" {
				return
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(root)

	return strings.Join(pieces, " "), nil
}
// nonKeywordChars matches runs of characters that cannot be part of a
// keyword: anything that is not a letter, combining mark, digit, or
// whitespace. It is compiled once at package scope rather than on every
// extractKeywords call.
var nonKeywordChars = regexp.MustCompile(`[^\p{L}\p{M}\p{N}\s]+`)

// minKeywordLen is the minimum length, in characters, of a word that can be
// reported as a keyword.
const minKeywordLen = 3

// extractKeywords returns the numKeywords most frequent words in text.
//
// The text is lowercased, punctuation and symbols are treated as word
// separators, and words shorter than minKeywordLen characters or listed in
// stopWords are ignored. Letters and digits of any script are kept, so
// non-ASCII words stay intact.
//
// The result is ordered by descending count; words with equal counts are
// ordered alphabetically so the output is the same on every run. Fewer than
// numKeywords entries are returned when the text has fewer distinct keywords,
// and no entries are returned when numKeywords is zero or negative.
func extractKeywords(text string, numKeywords int) []WordCount {
	if numKeywords <= 0 {
		return nil
	}

	cleaned := nonKeywordChars.ReplaceAllString(strings.ToLower(text), " ")

	// Count word frequencies.
	wordFreq := make(map[string]int)
	for _, word := range strings.Fields(cleaned) {
		// Measure in runes, not bytes, so multi-byte letters do not make a
		// short word look long enough.
		if len([]rune(word)) < minKeywordLen {
			continue
		}
		if stopWords[word] {
			continue
		}
		wordFreq[word]++
	}

	// Convert the map to a slice for sorting.
	wordCounts := make([]WordCount, 0, len(wordFreq))
	for word, count := range wordFreq {
		wordCounts = append(wordCounts, WordCount{Word: word, Count: count})
	}

	// Map iteration order is random, so ties need an explicit secondary key
	// to make the ranking reproducible.
	sort.Slice(wordCounts, func(i, j int) bool {
		if wordCounts[i].Count != wordCounts[j].Count {
			return wordCounts[i].Count > wordCounts[j].Count
		}
		return wordCounts[i].Word < wordCounts[j].Word
	})

	// Keep only the top numKeywords entries.
	if len(wordCounts) > numKeywords {
		wordCounts = wordCounts[:numKeywords]
	}
	return wordCounts
}