# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
import os
import re
import time

import markdown
import nomic
import numpy as np
import pandas as pd
from nomic import atlas
from nomic.dataset import AtlasClass
from nomic.data_inference import NomicTopicOptions

from src.my_logger import setup_logger

NOMIC_KEY = os.getenv('NOMIC_KEY')
nomic.login(NOMIC_KEY)
sleep_time = int(os.getenv('NOMIC_SLEEP_TIME', 60))
logger = setup_logger(__name__)

# Regex to extract the first subreddit mention like 'r/AskReddit'; the [^e]
# guard requires a preceding non-'e' character so 'user/...' paths don't match
# (which also means a mention at the very start of a string is skipped)
subreddit_re = re.compile(r'[^e]r/(\w+)')


def count_words(text):
    words = text.split()
    return len(words)


def preprocess_markdown(text):
    # Inline CSS for spoilers: black-on-black until hovered
    spoiler_style = 'background-color: black; color: black;'
    hover_color = 'inherit'  # makes the text visible on hover

    # Replace Reddit spoiler tags >!spoiler!< with an HTML span with inline styles.
    # The backreference is written '\\1' so re.sub receives '\1' rather than the
    # octal escape '\x01'.
    replacement = (
        f'<span class="spoiler" style="{spoiler_style}" '
        f'onmouseover="this.style.color=\'{hover_color}\'" '
        f'onmouseout="this.style.color=\'black\'">\\1</span>'
    )
    text = re.sub(r'>!(.*?)!<', replacement, text)
    return text

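# Illustrative input/output sketch (attribute values abbreviated):
#   preprocess_markdown('the twist: >!he did it!<')
#   -> 'the twist: <span class="spoiler" style="..." ...>he did it</span>'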

def convert_markdown_to_html(text):
    processed_text = preprocess_markdown(text)
    # 'mdx_linkify' (from the mdx-linkify package) turns bare URLs into links
    html = markdown.markdown(processed_text, extensions=['mdx_linkify'])
    return html


def extract_subreddit(text):
    match = subreddit_re.search(text)
    if match:
        return 'r/' + match.group(1)
    return ''
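# Illustrative input/output sketch:
#   extract_subreddit('originally posted in r/AskReddit')  -> 'r/AskReddit'
#   extract_subreddit('crossposted by user/someone')       -> ''  (the [^e] guard skips 'user/')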


def delete_old_nomic():
    logger.info("Trying to delete old version of nomic Atlas...")
    try:
        ac = AtlasClass()
        atlas_id = ac._get_dataset_by_slug_identifier("derek2/boru-subreddit-neural-search")['id']
        ac._delete_project_by_id(atlas_id)
        logger.info("Succeeded in deleting old version of nomic Atlas.")

        # Give the server time to finish the deletion before recreating the map
        logger.info(f"Sleeping for {sleep_time}s to wait for deletion to complete server-side")
        time.sleep(sleep_time)
    except Exception as e:
        logger.warning(f"Failed to delete old version of nomic Atlas. Error: {e}")


def build_nomic(dataset):
    df = dataset['train'].to_pandas()

    # For nomic: filter out rows that mention 'nsfw' in any of the text columns or whose 'nsfw' flag is True
    df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
            lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1) & ~df['nsfw']]

    non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
                             'score', 'score_percentile', 'html_content', 'subreddit']

    # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
    percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()

    # Ensure the bins are unique and include the maximum score
    bins = sorted(set(percentiles + [df['score'].max()]))

    # Define the labels for the percentile ranges
    # The number of labels should be one less than the number of bins
    labels = [int(i * 10) for i in range(len(bins) - 1)]

    # Add a 'percentile_ranges' column to the DataFrame
    # This assigns each score to its corresponding percentile range
    df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
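    # Worked sketch of the binning above: with 10 distinct decile cut points plus
    # max(score), pd.cut sees 11 edges (10 intervals) labelled 0, 10, ..., 90, so
    # each row gets the lower decile bound of its score; repeated cut points are
    # collapsed by set(), leaving fewer intervals and labels.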

    df['word_count'] = df['content'].apply(count_words)
    df['url'] = 'https://www.reddit.com' + df['permalink']
    df['html_content'] = df['content'].apply(convert_markdown_to_html)

    # Tag each post with the first subreddit mentioned in its content
    df['subreddit'] = df['content'].apply(extract_subreddit)

    topic_options = NomicTopicOptions(build_topic_model=True)
    topic_options.topic_label_field = 'html_content'

    delete_old_nomic()

    # Create Atlas project
    logger.info("Trying to create new version of nomic Atlas...")
    project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
                             data=df[non_embedding_columns].to_dict(orient='records'),
                             id_field='id',
                             identifier='BORU Subreddit Neural Search',
                             topic_model=topic_options
                             )
    logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")
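

if __name__ == '__main__':
    # Minimal usage sketch under assumptions: build_nomic expects an object whose
    # 'train' split converts to a DataFrame containing the columns used above
    # ('content', 'title', 'flair', 'permalink', 'nsfw', 'embedding', 'score',
    # 'date_utc', 'poster', 'id'). The dataset identifier below is a hypothetical
    # placeholder.
    from datasets import load_dataset

    boru_dataset = load_dataset('derek2/boru-posts')  # hypothetical identifier
    build_nomic(boru_dataset)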