# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
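"""Build the Nomic Atlas map backing the BORU subreddit neural search.

Filters out NSFW rows, derives word counts, score percentiles, HTML content and
subreddit names from the dataset, deletes the previous Atlas dataset, and uploads
a new map with topic modelling enabled.
"""
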
import os
import re
import time
import markdown
import nomic
import numpy as np
import pandas as pd
from nomic import atlas
from nomic.dataset import AtlasClass
from nomic.data_inference import NomicTopicOptions
from src.my_logger import setup_logger
NOMIC_KEY = os.getenv('NOMIC_KEY')
nomic.login(NOMIC_KEY)
sleep_time = int(os.getenv('NOMIC_SLEEP_TIME', 60))
logger = setup_logger(__name__)
# Regex to extract the first subreddit mention, e.g. 'r/AskReddit'.
# The [^e] requires a preceding character other than 'e', so paths like 'user/...' are not matched.
subreddit_re = re.compile(r'[^e]r/(\w+)')

def count_words(text):
    words = text.split()
    return len(words)

def preprocess_markdown(text):
    """Replace Reddit spoiler tags >!spoiler!< with an inline-styled HTML span."""
    # Inline CSS for spoilers: black text on a black background until hovered
    spoiler_style = 'background-color: black; color: black;'
    hover_color = 'inherit'  # colour applied on hover so the spoiler text becomes visible
    replacement = (
        '<span class="spoiler" style="' + spoiler_style + '"'
        ' onmouseover="this.style.color=\'' + hover_color + '\'"'
        ' onmouseout="this.style.color=\'black\'">\\1</span>'
    )
    text = re.sub(r'>!(.*?)!<', replacement, text)
    return text

def convert_markdown_to_html(text):
    processed_text = preprocess_markdown(text)
    # 'mdx_linkify' turns bare URLs into clickable links (requires the mdx_linkify package)
    html = markdown.markdown(processed_text, extensions=['mdx_linkify'])
    return html

def extract_subreddit(text):
    match = subreddit_re.search(text)
    if match:
        return 'r/' + match.group(1)
    return ''

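# For example, extract_subreddit('as seen in r/AskReddit today') returns 'r/AskReddit',
# while text with no subreddit mention returns ''.
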
def delete_old_nomic():
    logger.info("Trying to delete old version of nomic Atlas...")
    try:
        ac = AtlasClass()
        atlas_id = ac._get_dataset_by_slug_identifier("derek2/boru-subreddit-neural-search")['id']
        ac._delete_project_by_id(atlas_id)
        logger.info("Succeeded in deleting old version of nomic Atlas.")
        # sleep_time comes from the NOMIC_SLEEP_TIME environment variable (default 60s)
        logger.info(f"Sleeping for {sleep_time}s to wait for old version deletion on the server-side")
        time.sleep(sleep_time)
    except Exception as e:
        logger.info(f"Failed to delete old version of nomic Atlas. Error: {e}")

def build_nomic(dataset):
    df = dataset['train'].to_pandas()

    # For nomic: filter out rows that contain 'nsfw' in the text columns or where the 'nsfw' column is True
    df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
        lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1) & ~df['nsfw']]

    non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
                             'score', 'score_percentile', 'html_content', 'subreddit']

    # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
    percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
    # Ensure the bins are unique and include the maximum score
    bins = sorted(set(percentiles + [df['score'].max()]))
    # Define the labels for the percentile ranges
    # (the number of labels must be one less than the number of bins)
    labels = [int(i * 10) for i in range(len(bins) - 1)]
    # Add a 'score_percentile' column assigning each score to its percentile range
    df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)

    # Derive the remaining display columns
    df['word_count'] = df['content'].apply(count_words)
    df['url'] = 'https://www.reddit.com' + df['permalink']
    df['html_content'] = df['content'].apply(convert_markdown_to_html)
    df['subreddit'] = df['content'].apply(extract_subreddit)

    topic_options = NomicTopicOptions(build_topic_model=True)
    topic_options.topic_label_field = 'html_content'

    delete_old_nomic()

    # Create Atlas project
    logger.info("Trying to create new version of Atlas...")
    project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
                             data=df[non_embedding_columns].to_dict(orient='records'),
                             id_field='id',
                             identifier='BORU Subreddit Neural Search',
                             topic_model=topic_options
                             )
    logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")