package format
import (
"html"
"regexp"
"strings"
)
func SanitizeHTML(htmlContent string) string {
htmlContent = removeDangerousTags(htmlContent)
htmlContent = removeEventHandlers(htmlContent)
htmlContent = removeJavascriptProtocol(htmlContent)
htmlContent = sanitizeStyles(htmlContent)
return htmlContent
}
func removeDangerousTags(html string) string {
dangerousTags := []string{
"script", "iframe", "object", "embed", "applet",
"meta", "link", "base", "form", "input", "button",
}
for _, tag := range dangerousTags {
regex := regexp.MustCompile(`(?i)<` + tag + `[^>]*>[\s\S]*?` + tag + `>`)
html = regex.ReplaceAllString(html, "")
regex = regexp.MustCompile(`(?i)<` + tag + `[^>]*>`)
html = regex.ReplaceAllString(html, "")
}
return html
}
func removeEventHandlers(html string) string {
eventHandlers := regexp.MustCompile(`(?i)\s*on\w+\s*=\s*["'][^"']*["']`)
return eventHandlers.ReplaceAllString(html, "")
}
func removeJavascriptProtocol(html string) string {
jsProtocol := regexp.MustCompile(`(?i)javascript:`)
return jsProtocol.ReplaceAllString(html, "")
}
func sanitizeStyles(html string) string {
dangerousStyles := []string{"behavior", "expression", "binding", "import", "moz-binding"}
for _, style := range dangerousStyles {
regex := regexp.MustCompile(`(?i)` + style + `\s*:\s*[^;]+;?`)
html = regex.ReplaceAllString(html, "")
}
return html
}
func GenerateSnippet(bodyText, bodyHTML string) string {
text := bodyText
if text == "" && bodyHTML != "" {
text = StripHTML(bodyHTML)
}
text = strings.TrimSpace(text)
if len(text) > 150 {
text = text[:150] + "..."
}
return text
}
func StripHTML(html string) string {
text := html
styleRegex := regexp.MustCompile(`(?i)`)
text = styleRegex.ReplaceAllString(text, "")
scriptRegex := regexp.MustCompile(`(?i)`)
text = scriptRegex.ReplaceAllString(text, "")
headRegex := regexp.MustCompile(`(?i)
]*>[\s\S]*?`)
text = headRegex.ReplaceAllString(text, "")
text = strings.ReplaceAll(text, "
", "\n")
text = strings.ReplaceAll(text, "
", "\n")
text = strings.ReplaceAll(text, "
", "\n")
text = strings.ReplaceAll(text, "", "\n\n")
text = strings.ReplaceAll(text, "", "\n")
text = strings.ReplaceAll(text, "", "\n")
text = strings.ReplaceAll(text, "", "\n")
text = strings.ReplaceAll(text, "", "\n")
text = strings.ReplaceAll(text, "", "\n")
text = strings.ReplaceAll(text, "", "\n")
inTag := false
var result strings.Builder
for _, char := range text {
if char == '<' {
inTag = true
continue
}
if char == '>' {
inTag = false
continue
}
if !inTag {
result.WriteRune(char)
}
}
cleanText := result.String()
lines := strings.Split(cleanText, "\n")
var cleanLines []string
for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
cleanLines = append(cleanLines, line)
}
}
return strings.TrimSpace(strings.Join(cleanLines, " "))
}
func HTMLToPlainText(htmlContent string) string {
text := htmlContent
text = regexp.MustCompile(`(?i)`).ReplaceAllString(text, "")
text = regexp.MustCompile(`(?i)`).ReplaceAllString(text, "")
text = regexp.MustCompile(`(?i)]*>[\s\S]*?`).ReplaceAllString(text, "")
text = regexp.MustCompile(`(?i)]*>[\s\S]*?`).ReplaceAllString(text, "")
text = regexp.MustCompile(`(?i)
`).ReplaceAllString(text, "\n")
text = regexp.MustCompile(`(?i)`).ReplaceAllString(text, "\n\n")
text = regexp.MustCompile(`(?i)`).ReplaceAllString(text, "\n")
text = regexp.MustCompile(`(?i)`).ReplaceAllString(text, "\n")
text = regexp.MustCompile(`(?i)`).ReplaceAllString(text, "\n\n")
text = regexp.MustCompile(`(?i)`).ReplaceAllString(text, "\n")
text = regexp.MustCompile(`<[^>]+>`).ReplaceAllString(text, "")
text = strings.ReplaceAll(text, " ", " ")
text = strings.ReplaceAll(text, "<", "<")
text = strings.ReplaceAll(text, ">", ">")
text = strings.ReplaceAll(text, "&", "&")
text = strings.ReplaceAll(text, """, "\"")
text = strings.ReplaceAll(text, "'", "'")
text = strings.ReplaceAll(text, "'", "'")
text = regexp.MustCompile(`\n\s*\n\s*\n+`).ReplaceAllString(text, "\n\n")
text = regexp.MustCompile(`[ \t]+`).ReplaceAllString(text, " ")
lines := strings.Split(text, "\n")
var cleanLines []string
for _, line := range lines {
trimmed := strings.TrimSpace(line)
if trimmed != "" {
cleanLines = append(cleanLines, trimmed)
}
}
return strings.TrimSpace(strings.Join(cleanLines, "\n"))
}
func DecodeHTML(text string) string {
return html.UnescapeString(text)
}