# beans-outlier / prepare.py
# MarkusStoll's picture
# "labels"
# 67e49b4
import pickle
import datasets
import os
import umap
def _umap_project(vectors):
    """Fit UMAP on *vectors* and return the projection as nested lists.

    Both embedding columns are reduced with the same settings
    (``n_neighbors=70``, ``min_dist=0.5``, ``random_state=42``), so the
    configuration lives in one place instead of being repeated per column.
    """
    reducer = umap.UMAP(n_neighbors=70, min_dist=0.5, random_state=42)
    return reducer.fit_transform(vectors).tolist()


def _build_dataframe():
    """Load the ``renumics/beans-outlier`` train split and add derived columns.

    Adds ``label_str`` (label ids mapped through the dataset's ``labels``
    feature) and the two precomputed UMAP columns
    ``embedding_ft_precalc`` / ``embedding_foundation_precalc``.
    """
    ds = datasets.load_dataset("renumics/beans-outlier", split="train")
    print("Dataset loaded using datasets.load_dataset().")

    df = ds.to_pandas()

    # Look the label feature up once rather than once per row.
    label_feature = ds.features["labels"]
    df["label_str"] = df["labels"].apply(lambda x: label_feature.int2str(x))

    # NOTE: for quick experiments the frame can be truncated here,
    # e.g. ``df = df[:1000]``, before the expensive UMAP fits.

    # Precalculate UMAP embeddings.
    df["embedding_ft_precalc"] = _umap_project(df["embedding_ft"].tolist())
    print("Umap for ft done")

    df["embedding_foundation_precalc"] = _umap_project(
        df["embedding_foundation"].tolist()
    )
    print("Umap for base done")

    return df


def main(cache_file="dataset_cache.pkl"):
    """Return the prepared dataframe, using *cache_file* as a pickle cache.

    If *cache_file* exists it is unpickled and returned as-is. Otherwise the
    dataframe is built from scratch, pickled to *cache_file*, and returned.

    Args:
        cache_file: Path of the pickle cache. Defaults to the file name the
            script has always used, relative to the working directory.

    Returns:
        Whatever object the cache holds on a cache hit, or the freshly built
        dataframe on a cache miss.
    """
    if os.path.exists(cache_file):
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only run this against a cache file that this script wrote itself.
        with open(cache_file, "rb") as file:
            df = pickle.load(file)
        print("Dataset loaded from cache.")
        return df

    df = _build_dataframe()

    # Write to a sibling temp file and rename it into place, so an
    # interrupted run cannot leave a truncated cache that would make every
    # later run fail inside pickle.load().
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, "wb") as file:
        pickle.dump(df, file)
    os.replace(tmp_file, cache_file)
    print("Dataset saved to cache.")

    return df


if __name__ == "__main__":
    # The script used to bind `dataset` on a cache hit and `df` on a rebuild;
    # bind both so interactive sessions (`python -i`, `%run`) see either name.
    dataset = df = main()