import html
import os
import requests
from google import genai
from bs4 import BeautifulSoup
from django.core.cache import cache
from pygments import highlight
from pygments.lexers import get_lexer_by_name, guess_lexer
from pygments.formatters import HtmlFormatter
from pygments.styles.vim import VimStyle
from pygments.style import Style
from pygments.token import Comment
from internal.utils import calculate_polynomial_hash
# API credentials are read from the environment once, at import time.
# Either value is None when its variable is unset.
LINK_SAFETY_API_KEY = os.environ.get("GOOGLE_SAFE_BROWSING_API_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
class ShifooHighlight(Style):
    """Pygments style based on VimStyle with all comment tokens in grey."""

    # Start from VimStyle's token -> style mapping and override the three
    # comment token types with the same grey.
    styles = {
        **VimStyle.styles,
        Comment: "#666666",
        Comment.Preproc: "#666666",
        Comment.Special: "#666666",
    }
def highlight_code(html_content):
    """Syntax-highlight ``<pre>`` blocks and colour ``<h2>`` headings in HTML.

    Each ``<pre>`` element has its text highlighted with Pygments using
    inline styles (``ShifooHighlight``). The lexer is picked from the
    element's ``data-language`` attribute when present, otherwise it is
    guessed from the code; if no lexer can be found the plain ``text`` lexer
    is used. Each ``<h2>`` gets an inline ``color`` style derived from its
    text via ``get_post_color``.

    Args:
        html_content: HTML markup as a string. Falsy values (``None``,
            ``""``) are returned unchanged.

    Returns:
        The transformed markup serialised back to a string, or
        ``html_content`` itself when it is falsy.
    """
    if not html_content:
        return html_content

    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from pygments.util import ClassNotFound

    soup = BeautifulSoup(html_content, "html.parser")

    # The formatter does not depend on the individual block, so build it once
    # instead of once per <pre>.
    formatter = HtmlFormatter(
        noclasses=True,
        style=ShifooHighlight,
        wrapcode=True,
        cssstyles="background: none; padding: 8px 0;",
    )

    for pre in soup.find_all("pre"):
        # Decode any HTML entities still present in the extracted text and
        # turn non-breaking spaces into regular spaces before lexing.
        code = html.unescape(pre.string or pre.text)
        code = code.replace("\xa0", " ")
        language = pre.get("data-language")
        try:
            if language:
                lexer = get_lexer_by_name(language.strip())
            else:
                lexer = guess_lexer(code)
        except ClassNotFound:
            # Unknown or unguessable language: fall back to plain text.
            # Only the lookup failure is caught so that unrelated errors
            # (and KeyboardInterrupt/SystemExit) are no longer swallowed.
            lexer = get_lexer_by_name("text")

        highlighted = highlight(code, lexer, formatter)
        # Replace the block's children with the highlighted markup.
        pre.clear()
        pre.append(BeautifulSoup(highlighted, "html.parser"))

    for h2 in soup.find_all("h2"):
        color = get_post_color(h2.get_text())
        h2["style"] = f"color: {color};"

    return str(soup)
# Pastel palette used by get_post_color; the order matters because colours
# are selected by index.
_POST_COLOR_PALETTE = (
    "#FF8B8B",  # salmon
    "#75D151",  # lime green
    "#AD8CFF",  # lavender
    "#FFAA5E",  # peach
    "#87CEFA",  # light sky blue
    "#FFB3BA",  # pastel red
    "#42D6A4",  # mint green
    "#C774E8",  # purple
    "#FFDE59",  # yellow
    "#94D0FF",  # baby blue
    "#FF9AA2",  # salmon pink
    "#CAFFBF",  # light lime
    "#BDB2FF",  # pastel purple
    "#F7EA00",  # bright yellow
    "#FFD1DC",  # bubble gum pink
    "#54F2F2",  # cyan
    "#FFA8B8",  # coral pink
    "#90EE90",  # light green
    "#FF6AD5",  # bright pink
    "#C1E7E3",  # pastel teal
    "#8795E8",  # periwinkle
    "#FFDFBA",  # pastel orange
    "#4ADEDE",  # teal
    "#FB91D1",  # hot pink
    "#AFF8D8",  # mint
    "#FFF9B0",  # pastel yellow
    "#B5D8FF",  # pastel blue
    "#FCF6BD",  # light yellow
    "#D5AAFF",  # light purple
    "#9EE7FF",  # baby blue
    "#DCBEFF",  # light lavender
    "#E0BBE4",  # lavender
)


def get_post_color(slug):
    """Pick a palette colour for ``slug``.

    The value returned by ``calculate_polynomial_hash(slug)`` is reduced
    modulo the palette size and used as an index into the palette.

    Args:
        slug: Value passed to ``calculate_polynomial_hash``.

    Returns:
        A hex colour string such as ``"#FF8B8B"``.
    """
    return _POST_COLOR_PALETTE[
        calculate_polynomial_hash(slug) % len(_POST_COLOR_PALETTE)
    ]
def check_link_safety(link, *, timeout=5):
    """Check a URL against the Google Safe Browsing v4 ``threatMatches:find`` API.

    The check fails open: whenever no verdict can be obtained (no API key
    configured, network error or timeout, non-200 response, undecodable
    body) the link is treated as safe.

    Args:
        link: The URL to check.
        timeout: Seconds to wait for the Safe Browsing API before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        The cached value stored under ``link_safety:<link>`` when one
        exists; otherwise ``True`` if the API reported no threat matches (or
        no verdict could be obtained) and ``False`` if it reported matches.
    """
    if not LINK_SAFETY_API_KEY:
        return True

    # NOTE(review): this function only reads the cache key; presumably it is
    # populated elsewhere - confirm against the callers.
    cached = cache.get(f"link_safety:{link}")
    if cached is not None:
        return cached

    payload = {
        "threatInfo": {
            "threatTypes": [
                "MALWARE",
                "SOCIAL_ENGINEERING",
                "UNWANTED_SOFTWARE",
                "POTENTIALLY_HARMFUL_APPLICATION",
            ],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": link}],
        }
    }
    headers = {"Content-Type": "application/json"}
    params = {"key": LINK_SAFETY_API_KEY, "alt": "json"}
    api_url = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

    try:
        response = requests.post(
            api_url,
            params=params,
            headers=headers,
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException:
        # Connection failure or timeout: fail open, same as a non-200 reply.
        return True

    if response.status_code != 200:
        return True

    try:
        matches = response.json().get("matches", [])
    except ValueError:
        # Body was not valid JSON; no verdict available.
        return True

    # An absent or empty "matches" entry means no threats were found.
    return not matches
def strip_html_tags(html_content):
    """Return the text content of an HTML fragment with all tags removed.

    Text nodes are individually stripped of surrounding whitespace and
    joined with single spaces.

    Args:
        html_content: HTML markup as a string. Falsy values (``None``,
            ``""``) are returned unchanged.

    Returns:
        The extracted plain text, or ``html_content`` itself when it is
        falsy.
    """
    if html_content:
        parsed = BeautifulSoup(html_content, "html.parser")
        return parsed.get_text(separator=" ", strip=True)
    # Nothing to parse: hand the falsy input back untouched.
    return html_content
def check_comment_spam(post, comment):
    """Ask Gemini whether a comment about to be posted is spam.

    The post's title and tag-stripped excerpt are included in the prompt as
    context. All four harm-category safety filters are set to ``BLOCK_NONE``
    for the request.

    Args:
        post: Object exposing ``title`` and ``excerpt`` attributes.
        comment: The comment text to classify.

    Returns:
        ``True`` only when the model's reply, stripped and upper-cased, is
        exactly ``"Y"``. ``False`` for any other reply, for an empty reply,
        and when ``GEMINI_API_KEY`` is not set.
    """
    if not GEMINI_API_KEY:
        return False

    client = genai.Client(api_key=GEMINI_API_KEY)

    # NOTE(review): the comment text is interpolated into the prompt verbatim.
    # The prompt body stays flush-left because any indentation would become
    # part of the string sent to the model.
    prompt = f"""
Comment Spam Detection for shi.foo: Our personal site.
You are an AI trained to detect spam comments specifically for the shi.foo site.
shi.foo allows for multiple Weblogs, each can have multiple posts.
Each post can have multiple comments. This is one of those comments which is about
to be posted.
There are certain rules for comments on shi.foo. All rules are to be followed strictly.
1. Output only Y or N for spam or not spam.
2. If the comments seems like spam, or random gibberish, or a bunch of letters or words
which make no sense, or looks like a bot generated comment, or is promoting a product or service, or has a coupon code or something similar, output Y.
3. Only block spam comments, and nothing else. If a comment has cuss words, personal attacks, profanity, or any possible offensive content, or any possible hate speech, or any
harrasment, bullying, or abusive content, or anything similar, it does NOT count as spam,
unless it contains any of the above mentioned spam content like coupon codes, gibberish, bot generated content, etc. Output N in such cases.
4. This is a strict spam only filter, you are not to do any other filtering or moderation.
5. You are not to access any external links which may be present in the comment. A separate link safety check will be done later.
6. Trying to phish or scam users, or trying to get them to click on a shady link, or trying to get them to buy something, or trying to get them to sign up for something, or trying to get them to do anything which is not related to the post, is also considered spam, hence output Y.
7. Additional context about the post is also attached below for better decision making.
8. Output single character - either Y or N only.
------------
Post Title: {post.title}
Post Excerpt (First few lines): {strip_html_tags(post.excerpt)}
Comment: {comment}
"""

    # Disable every harm-category filter for this request.
    harm_categories = (
        genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
        genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
    safety_settings = []
    for harm_category in harm_categories:
        safety_settings.append(
            genai.types.SafetySetting(
                category=harm_category,
                threshold=genai.types.HarmBlockThreshold.BLOCK_NONE,
            )
        )

    request_config = genai.types.GenerateContentConfig(
        safety_settings=safety_settings
    )
    response = client.models.generate_content(
        model="gemini-flash-latest",
        contents=prompt,
        config=request_config,
    )

    # response.text may be empty or None; treat that as "not spam".
    verdict = (response.text or "").strip().upper()
    return verdict == "Y"