import{s as Sl,y as Rl,n as Wl,o as Xl}from"../chunks/scheduler.d6170356.js";import{S as Ql,i as Nl,g as p,s as e,r as c,A as Yl,h as M,f as a,c as t,j as kl,u as i,x as y,k as is,y as Fl,a as n,v as r,d as J,t as h,w as o}from"../chunks/index.fcd4cc08.js";import{C as U}from"../chunks/CodeBlock.7b16bdef.js";import{H as ps,E as Al}from"../chunks/EditOnGithub.da2b595c.js";function El(el){let m,rs,ys,Js,w,hs,T,tl="Spark enables real-time, large-scale data processing in a distributed environment.",os,d,pl="In particular you can use <code>huggingface_hub</code> to access Hugging Face datasets repositories in PySpark",Us,b,ms,I,Ml="To be able to read and write to Hugging Face URLs (e.g. <code>hf://datasets/username/dataset/data.parquet</code>), you need to install the <code>huggingface_hub</code> library:",us,C,js,f,yl="You also need to install <code>pyarrow</code> to read/write Parquet / JSON / CSV / etc. files using the filesystem API provided by <code>huggingFace_hub</code>:",ws,B,Ts,G,ds,g,cl="You need to authenticate to Hugging Face to read private/gated dataset repositories or to write to your dataset repositories.",bs,Z,il="You can use the CLI for example:",Is,k,Cs,S,rl="It’s also possible to provide your Hugging Face token with the <code>HF_TOKEN</code> environment variable or passing the <code>storage_options</code> parameter to helper functions below:",fs,R,Bs,W,Jl='For more details about authentication, check out <a href="https://huggingface.co/docs/huggingface_hub/quick-start#authentication" rel="nofollow">this guide</a>.',Gs,X,gs,Q,hl="PySpark doesn’t have an official support for Hugging Face paths, so we provide a helper function to read datasets in a distributed manner.",Zs,N,ol="For example you can read Parquet files from Hugging Face in an optimized way using PyArrow by defining this <code>read_parquet</code> helper function:",ks,Y,Ss,F,Ul=`Here is how we can use this on the <a href="https://huggingface.co/datasets/BAAI/Infinity-Instruct" 
rel="nofollow">BAAI/Infinity-Instruct</a> dataset.
It is a gated repository; users have to accept the terms of use before accessing it.`,Rs,u,ml='<img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-spark-infinity-instruct-7M-min.png"/> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-spark-infinity-instruct-7M-dark-min.png"/>',Ws,A,ul="We use the <code>read_parquet</code> function to read data from the dataset, compute the number of dialogues per language and filter the dataset.",Xs,E,jl="After logging in to access the gated repository, we can run:",Qs,V,Ns,v,wl=`To compute the number of dialogues per language we run this code.
The <code>columns</code> argument is useful to only load the data we need, since PySpark doesn’t enable predicate push-down in this case.
There is also a <code>filters</code> argument to only load data with values within a certain range.`,Ys,z,Fs,_,Tl="To filter the dataset and only keep dialogues in Chinese:",As,q,Es,H,Vs,x,dl="Once you have your PySpark Dataframe ready, you can run SQL queries using <code>spark.sql</code>:",vs,$,zs,L,_s,D,bl="We also provide a helper function to write datasets in a distributed manner to a Hugging Face repository.",qs,P,Il=`You can write a PySpark Dataframe to Hugging Face using this <code>write_parquet</code> helper function based on the <code>huggingface_hub</code> API.
In particular it uses the <code>preupload_lfs_files</code> utility to upload Parquet files in parallel in a distributed manner, and only commits the files once they’re all uploaded:`,Hs,K,xs,O,Cl='Here is how we can use this function to write the filtered version of the <a href="https://huggingface.co/datasets/BAAI/Infinity-Instruct" rel="nofollow">BAAI/Infinity-Instruct</a> dataset back to Hugging Face.',$s,ss,fl=`First you need to <a href="https://huggingface.co/new-dataset" rel="nofollow">create a dataset repository</a>, e.g. <code>username/Infinity-Instruct-Chinese-Only</code> (you can set it to private if you want).
Then, make sure you are authenticated and you can run:`,Ls,ls,Ds,j,Bl='<img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-spark-infinity-instruct-chinese-only-min.png"/> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-spark-infinity-instruct-chinese-only-dark-min.png"/>',Ps,as,Ks,ns,Gl='You can duplicate the <a href="https://huggingface.co/spaces/lhoestq/Spark-on-HF-JupyterLab" rel="nofollow">Spark on HF JupyterLab</a> Space to get a Notebook with PySpark and those helper functions pre-installed.',Os,es,gl="Click on “Duplicate Space”, choose a name for your Space, select your hardware and you are ready:",sl,Ms,Zl,ll,ts,al,cs,nl;return w=new ps({props:{title:"Spark",local:"spark",headingTag:"h1"}}),b=new ps({props:{title:"Installation",local:"installation",headingTag:"h2"}}),C=new U({props:{code:"cGlwJTIwaW5zdGFsbCUyMGh1Z2dpbmdmYWNlX2h1Yg==",highlighted:'pip <span class="hljs-keyword">install</span> huggingface_hub',wrap:!1}}),B=new U({props:{code:"cGlwJTIwaW5zdGFsbCUyMHB5YXJyb3c=",highlighted:'pip <span class="hljs-keyword">install</span> pyarrow',wrap:!1}}),G=new ps({props:{title:"Authentication",local:"authentication",headingTag:"h2"}}),k=new U({props:{code:"aHVnZ2luZ2ZhY2UtY2xpJTIwbG9naW4=",highlighted:'huggingface-<span class="hljs-keyword">cli</span> login',wrap:!1}}),R=new U({props:{code:"c3RvcmFnZV9vcHRpb25zJTIwJTNEJTIwJTdCJTIydG9rZW4lMjIlM0ElMjAlMjJoZl94eHglMjIlN0Q=",highlighted:'storage_options = {<span class="hljs-string">&quot;token&quot;</span>: <span class="hljs-string">&quot;hf_xxx&quot;</span>}',wrap:!1}}),X=new ps({props:{title:"Read",local:"read",headingTag:"h2"}}),Y=new 
U({props:{code:"ZnJvbSUyMGZ1bmN0b29scyUyMGltcG9ydCUyMHBhcnRpYWwlMEFmcm9tJTIwdHlwaW5nJTIwaW1wb3J0JTIwSXRlcmF0b3IlMkMlMjBPcHRpb25hbCUyQyUyMFVuaW9uJTBBJTBBaW1wb3J0JTIwcHlhcnJvdyUyMGFzJTIwcGElMEFpbXBvcnQlMjBweWFycm93LnBhcnF1ZXQlMjBhcyUyMHBxJTBBZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMEhmRmlsZVN5c3RlbSUwQWZyb20lMjBweXNwYXJrLnNxbC5kYXRhZnJhbWUlMjBpbXBvcnQlMjBEYXRhRnJhbWUlMEFmcm9tJTIwcHlzcGFyay5zcWwucGFuZGFzLnR5cGVzJTIwaW1wb3J0JTIwZnJvbV9hcnJvd19zY2hlbWElMEElMEElMEFkZWYlMjBfcmVhZChpdGVyYXRvciUzQSUyMEl0ZXJhdG9yJTVCcGEuUmVjb3JkQmF0Y2glNUQlMkMlMjBjb2x1bW5zJTNBJTIwT3B0aW9uYWwlNUJsaXN0JTVCc3RyJTVEJTVEJTJDJTIwZmlsdGVycyUzQSUyME9wdGlvbmFsJTVCVW5pb24lNUJsaXN0JTVCdHVwbGUlNUQlMkMlMjBsaXN0JTVCbGlzdCU1QnR1cGxlJTVEJTVEJTVEJTVEJTJDJTIwKiprd2FyZ3MpJTIwLSUzRSUyMEl0ZXJhdG9yJTVCcGEuUmVjb3JkQmF0Y2glNUQlM0ElMEElMjAlMjAlMjAlMjBmb3IlMjBiYXRjaCUyMGluJTIwaXRlcmF0b3IlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwYXRocyUyMCUzRCUyMGJhdGNoJTVCMCU1RC50b19weWxpc3QoKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGRzJTIwJTNEJTIwcHEuUGFycXVldERhdGFzZXQocGF0aHMlMkMlMjAqKmt3YXJncyklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB5aWVsZCUyMGZyb20lMjBkcy5fZGF0YXNldC50b19iYXRjaGVzKGNvbHVtbnMlM0Rjb2x1bW5zJTJDJTIwZmlsdGVyJTNEcHEuZmlsdGVyc190b19leHByZXNzaW9uKGZpbHRlcnMpJTIwaWYlMjBmaWx0ZXJzJTIwZWxzZSUyME5vbmUpJTBBJTBBJTBBZGVmJTIwcmVhZF9wYXJxdWV0KCUwQSUyMCUyMCUyMCUyMHBhdGglM0ElMjBzdHIlMkMlMEElMjAlMjAlMjAlMjBjb2x1bW5zJTNBJTIwT3B0aW9uYWwlNUJsaXN0JTVCc3RyJTVEJTVEJTIwJTNEJTIwTm9uZSUyQyUwQSUyMCUyMCUyMCUyMGZpbHRlcnMlM0ElMjBPcHRpb25hbCU1QlVuaW9uJTVCbGlzdCU1QnR1cGxlJTVEJTJDJTIwbGlzdCU1Qmxpc3QlNUJ0dXBsZSU1RCU1RCU1RCU1RCUyMCUzRCUyME5vbmUlMkMlMEElMjAlMjAlMjAlMjAqKmt3YXJncyUyQyUwQSklMjAtJTNFJTIwRGF0YUZyYW1lJTNBJTBBJTIwJTIwJTIwJTIwJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwTG9hZHMlMjBQYXJxdWV0JTIwZmlsZXMlMjBmcm9tJTIwSHVnZ2luZyUyMEZhY2UlMjB1c2luZyUyMFB5QXJyb3clMkMlMjByZXR1cm5pbmclMjBhJTIwUHlTUGFyayUyMCU2MERhdGFGcmFtZSU2MC4lMEElMEElMjAlMjAlMjAlMjBJdCUyMHJlYWRzJTIwUGFycXVldCUyMGZpbGVzJTIwaW4lMjBhJTIwZGlzdHJpYnV0ZWQlMjBtYW5uZXIuJTBBJTBBJTIwJTIwJTIwJTIwQWNjZXNzJTIwcHJp
dmF0ZSUyMG9yJTIwZ2F0ZWQlMjByZXBvc2l0b3JpZXMlMjB1c2luZyUyMCU2MGh1Z2dpbmdmYWNlLWNsaSUyMGxvZ2luJTYwJTIwb3IlMjBwYXNzaW5nJTIwYSUyMHRva2VuJTBBJTIwJTIwJTIwJTIwdXNpbmclMjB0aGUlMjAlNjBzdG9yYWdlX29wdGlvbnMlNjAlMjBhcmd1bWVudCUzQSUyMCU2MHN0b3JhZ2Vfb3B0aW9ucyUzRCU3QiUyMnRva2VuJTIyJTNBJTIwJTIyaGZfeHh4JTIyJTdEJTYwJTBBJTBBJTIwJTIwJTIwJTIwUGFyYW1ldGVycyUwQSUyMCUyMCUyMCUyMC0tLS0tLS0tLS0lMEElMjAlMjAlMjAlMjBwYXRoJTIwJTNBJTIwc3RyJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwUGF0aCUyMHRvJTIwdGhlJTIwZmlsZS4lMjBQcmVmaXglMjB3aXRoJTIwYSUyMHByb3RvY29sJTIwbGlrZSUyMCU2MGhmJTNBJTJGJTJGJTYwJTIwdG8lMjByZWFkJTIwZnJvbSUyMEh1Z2dpbmclMjBGYWNlLiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFlvdSUyMGNhbiUyMHJlYWQlMjBmcm9tJTIwbXVsdGlwbGUlMjBmaWxlcyUyMGlmJTIweW91JTIwcGFzcyUyMGElMjBnbG9ic3RyaW5nLiUwQSUyMCUyMCUyMCUyMGNvbHVtbnMlMjAlM0ElMjBsaXN0JTJDJTIwZGVmYXVsdCUyME5vbmUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBJZiUyMG5vdCUyME5vbmUlMkMlMjBvbmx5JTIwdGhlc2UlMjBjb2x1bW5zJTIwd2lsbCUyMGJlJTIwcmVhZCUyMGZyb20lMjB0aGUlMjBmaWxlLiUwQSUyMCUyMCUyMCUyMGZpbHRlcnMlMjAlM0ElMjBMaXN0JTVCVHVwbGUlNUQlMjBvciUyMExpc3QlNUJMaXN0JTVCVHVwbGUlNUQlNUQlMkMlMjBkZWZhdWx0JTIwTm9uZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFRvJTIwZmlsdGVyJTIwb3V0JTIwZGF0YS4lMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBGaWx0ZXIlMjBzeW50YXglM0ElMjAlNUIlNUIoY29sdW1uJTJDJTIwb3AlMkMlMjB2YWwpJTJDJTIwLi4uJTVEJTJDLi4uJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwd2hlcmUlMjBvcCUyMGlzJTIwJTVCJTNEJTNEJTJDJTIwJTNEJTJDJTIwJTNFJTJDJTIwJTNFJTNEJTJDJTIwJTNDJTJDJTIwJTNDJTNEJTJDJTIwISUzRCUyQyUyMGluJTJDJTIwbm90JTIwaW4lNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBpbm5lcm1vc3QlMjB0dXBsZXMlMjBhcmUlMjB0cmFuc3Bvc2VkJTIwaW50byUyMGElMjBzZXQlMjBvZiUyMGZpbHRlcnMlMjBhcHBsaWVkJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdGhyb3VnaCUyMGFuJTIwJTYwQU5EJTYwJTIwb3BlcmF0aW9uLiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFRoZSUyMG91dGVyJTIwbGlzdCUyMGNvbWJpbmVzJTIwdGhlc2UlMjBzZXRzJTIwb2YlMjBmaWx0ZXJzJTIwdGhyb3VnaCUyMGFuJTIwJTYwT1IlNjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBvcGVyYXRpb24uJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwQSUyMHNpbmdsZSUyMGxpc3QlMjBvZiUyMHR1cGxlcyUy
MGNhbiUyMGFsc28lMjBiZSUyMHVzZWQlMkMlMjBtZWFuaW5nJTIwdGhhdCUyMG5vJTIwJTYwT1IlNjAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBvcGVyYXRpb24lMjBiZXR3ZWVuJTIwc2V0JTIwb2YlMjBmaWx0ZXJzJTIwaXMlMjB0byUyMGJlJTIwY29uZHVjdGVkLiUwQSUwQSUyMCUyMCUyMCUyMCoqa3dhcmdzJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwQW55JTIwYWRkaXRpb25hbCUyMGt3YXJncyUyMGFyZSUyMHBhc3NlZCUyMHRvJTIwcHlhcnJvdy5wYXJxdWV0LlBhcnF1ZXREYXRhc2V0LiUwQSUwQSUyMCUyMCUyMCUyMFJldHVybnMlMEElMjAlMjAlMjAlMjAtLS0tLS0tJTBBJTIwJTIwJTIwJTIwRGF0YUZyYW1lJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwRGF0YUZyYW1lJTIwYmFzZWQlMjBvbiUyMHBhcnF1ZXQlMjBmaWxlLiUwQSUwQSUyMCUyMCUyMCUyMEV4YW1wbGVzJTBBJTIwJTIwJTIwJTIwLS0tLS0tLS0lMEElMjAlMjAlMjAlMjAlM0UlM0UlM0UlMjBwYXRoJTIwJTNEJTIwJTIyaGYlM0ElMkYlMkZkYXRhc2V0cyUyRnVzZXJuYW1lJTJGZGF0YXNldCUyRmRhdGEucGFycXVldCUyMiUwQSUyMCUyMCUyMCUyMCUzRSUzRSUzRSUyMHBkLkRhdGFGcmFtZSglN0IlMjJmb28lMjIlM0ElMjByYW5nZSg1KSUyQyUyMCUyMmJhciUyMiUzQSUyMHJhbmdlKDUlMkMlMjAxMCklN0QpLnRvX3BhcnF1ZXQocGF0aCklMEElMjAlMjAlMjAlMjAlM0UlM0UlM0UlMjByZWFkX3BhcnF1ZXQocGF0aCkuc2hvdygpJTBBJTIwJTIwJTIwJTIwJTJCLS0tJTJCLS0tJTJCJTBBJTIwJTIwJTIwJTIwJTdDZm9vJTdDYmFyJTdDJTBBJTIwJTIwJTIwJTIwJTJCLS0tJTJCLS0tJTJCJTBBJTIwJTIwJTIwJTIwJTdDJTIwJTIwMCU3QyUyMCUyMDUlN0MlMEElMjAlMjAlMjAlMjAlN0MlMjAlMjAxJTdDJTIwJTIwNiU3QyUwQSUyMCUyMCUyMCUyMCU3QyUyMCUyMDIlN0MlMjAlMjA3JTdDJTBBJTIwJTIwJTIwJTIwJTdDJTIwJTIwMyU3QyUyMCUyMDglN0MlMEElMjAlMjAlMjAlMjAlN0MlMjAlMjA0JTdDJTIwJTIwOSU3QyUwQSUyMCUyMCUyMCUyMCUyQi0tLSUyQi0tLSUyQiUwQSUyMCUyMCUyMCUyMCUzRSUzRSUzRSUyMHJlYWRfcGFycXVldChwYXRoJTJDJTIwY29sdW1ucyUzRCU1QiUyMmJhciUyMiU1RCkuc2hvdygpJTBBJTIwJTIwJTIwJTIwJTJCLS0tJTJCJTBBJTIwJTIwJTIwJTIwJTdDYmFyJTdDJTBBJTIwJTIwJTIwJTIwJTJCLS0tJTJCJTBBJTIwJTIwJTIwJTIwJTdDJTIwJTIwNSU3QyUwQSUyMCUyMCUyMCUyMCU3QyUyMCUyMDYlN0MlMEElMjAlMjAlMjAlMjAlN0MlMjAlMjA3JTdDJTBBJTIwJTIwJTIwJTIwJTdDJTIwJTIwOCU3QyUwQSUyMCUyMCUyMCUyMCU3QyUyMCUyMDklN0MlMEElMjAlMjAlMjAlMjAlMkItLS0lMkIlMEElMjAlMjAlMjAlMjAlM0UlM0UlM0UlMjBzZWwlMjAlM0QlMjAlNUIoJTIyZm9vJTIyJTJDJTIwJTIyJTNFJTIyJTJDJTIwMiklNUQlMEElMjAlMjAlMjAlMjAlM0UlM0UlM0UlMjByZWFkX3BhcnF1ZXQocGF0
aCUyQyUyMGZpbHRlcnMlM0RzZWwpLnNob3coKSUwQSUyMCUyMCUyMCUyMCUyQi0tLSUyQi0tLSUyQiUwQSUyMCUyMCUyMCUyMCU3Q2ZvbyU3Q2JhciU3QyUwQSUyMCUyMCUyMCUyMCUyQi0tLSUyQi0tLSUyQiUwQSUyMCUyMCUyMCUyMCU3QyUyMCUyMDMlN0MlMjAlMjA4JTdDJTBBJTIwJTIwJTIwJTIwJTdDJTIwJTIwNCU3QyUyMCUyMDklN0MlMEElMjAlMjAlMjAlMjAlMkItLS0lMkItLS0lMkIlMEElMjAlMjAlMjAlMjAlMjIlMjIlMjIlMEElMjAlMjAlMjAlMjBmaWxlc3lzdGVtJTNBJTIwSGZGaWxlU3lzdGVtJTIwJTNEJTIwa3dhcmdzLnBvcCglMjJmaWxlc3lzdGVtJTIyKSUyMGlmJTIwJTIyZmlsZXN5c3RlbSUyMiUyMGluJTIwa3dhcmdzJTIwZWxzZSUyMEhmRmlsZVN5c3RlbSgqKmt3YXJncy5wb3AoJTIyc3RvcmFnZV9vcHRpb25zJTIyJTJDJTIwJTdCJTdEKSklMEElMjAlMjAlMjAlMjBwYXRocyUyMCUzRCUyMGZpbGVzeXN0ZW0uZ2xvYihwYXRoKSUwQSUyMCUyMCUyMCUyMGlmJTIwbm90JTIwcGF0aHMlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByYWlzZSUyMEZpbGVOb3RGb3VuZEVycm9yKGYlMjJDb3VubGRuJ3QlMjBmaW5kJTIwYW55JTIwZmlsZSUyMGF0JTIwJTdCcGF0aCU3RCUyMiklMEElMjAlMjAlMjAlMjByZGQlMjAlM0QlMjBzcGFyay5zcGFya0NvbnRleHQucGFyYWxsZWxpemUoJTVCJTdCJTIycGF0aCUyMiUzQSUyMHBhdGglN0QlMjBmb3IlMjBwYXRoJTIwaW4lMjBwYXRocyU1RCUyQyUyMGxlbihwYXRocykpJTBBJTIwJTIwJTIwJTIwZGYlMjAlM0QlMjBzcGFyay5jcmVhdGVEYXRhRnJhbWUocmRkKSUwQSUyMCUyMCUyMCUyMGFycm93X3NjaGVtYSUyMCUzRCUyMHBxLnJlYWRfc2NoZW1hKGZpbGVzeXN0ZW0ub3BlbihwYXRocyU1QjAlNUQpKSUwQSUyMCUyMCUyMCUyMHNjaGVtYSUyMCUzRCUyMHBhLnNjaGVtYSglNUJmaWVsZCUyMGZvciUyMGZpZWxkJTIwaW4lMjBhcnJvd19zY2hlbWElMjBpZiUyMChjb2x1bW5zJTIwaXMlMjBOb25lJTIwb3IlMjBmaWVsZC5uYW1lJTIwaW4lMjBjb2x1bW5zKSU1RCUyQyUyMG1ldGFkYXRhJTNEYXJyb3dfc2NoZW1hLm1ldGFkYXRhKSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMGRmLm1hcEluQXJyb3coJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcGFydGlhbChfcmVhZCUyQyUyMGNvbHVtbnMlM0Rjb2x1bW5zJTJDJTIwZmlsdGVycyUzRGZpbHRlcnMlMkMlMjBmaWxlc3lzdGVtJTNEZmlsZXN5c3RlbSUyQyUyMHNjaGVtYSUzRGFycm93X3NjaGVtYSUyQyUyMCoqa3dhcmdzKSUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGZyb21fYXJyb3dfc2NoZW1hKHNjaGVtYSklMkMlMEElMjAlMjAlMjAlMjAp",highlighted:`<span class="hljs-keyword">from</span> functools <span class="hljs-keyword">import</span> partial
<span class="hljs-keyword">from</span> typing <span class="hljs-keyword">import</span> Iterator, <span class="hljs-type">Optional</span>, <span class="hljs-type">Union</span>

<span class="hljs-keyword">import</span> pyarrow <span class="hljs-keyword">as</span> pa
<span class="hljs-keyword">import</span> pyarrow.parquet <span class="hljs-keyword">as</span> pq
<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> HfFileSystem
<span class="hljs-keyword">from</span> pyspark.sql.dataframe <span class="hljs-keyword">import</span> DataFrame
<span class="hljs-keyword">from</span> pyspark.sql.pandas.types <span class="hljs-keyword">import</span> from_arrow_schema


<span class="hljs-keyword">def</span> <span class="hljs-title function_">_read</span>(<span class="hljs-params">iterator: Iterator[pa.RecordBatch], columns: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">list</span>[<span class="hljs-built_in">str</span>]], filters: <span class="hljs-type">Optional</span>[<span class="hljs-type">Union</span>[<span class="hljs-built_in">list</span>[<span class="hljs-built_in">tuple</span>], <span class="hljs-built_in">list</span>[<span class="hljs-built_in">list</span>[<span class="hljs-built_in">tuple</span>]]]], **kwargs</span>) -&gt; Iterator[pa.RecordBatch]:
    <span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> iterator:
        paths = batch[<span class="hljs-number">0</span>].to_pylist()
        ds = pq.ParquetDataset(paths, **kwargs)
        <span class="hljs-keyword">yield</span> <span class="hljs-keyword">from</span> ds._dataset.to_batches(columns=columns, <span class="hljs-built_in">filter</span>=pq.filters_to_expression(filters) <span class="hljs-keyword">if</span> filters <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span>)


<span class="hljs-keyword">def</span> <span class="hljs-title function_">read_parquet</span>(<span class="hljs-params">
    path: <span class="hljs-built_in">str</span>,
    columns: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">list</span>[<span class="hljs-built_in">str</span>]] = <span class="hljs-literal">None</span>,
    filters: <span class="hljs-type">Optional</span>[<span class="hljs-type">Union</span>[<span class="hljs-built_in">list</span>[<span class="hljs-built_in">tuple</span>], <span class="hljs-built_in">list</span>[<span class="hljs-built_in">list</span>[<span class="hljs-built_in">tuple</span>]]]] = <span class="hljs-literal">None</span>,
    **kwargs,
</span>) -&gt; DataFrame:
    <span class="hljs-string">&quot;&quot;&quot;
    Loads Parquet files from Hugging Face using PyArrow, returning a PySpark \`DataFrame\`.

    It reads Parquet files in a distributed manner.

    Access private or gated repositories using \`huggingface-cli login\` or passing a token
    using the \`storage_options\` argument: \`storage_options={&quot;token&quot;: &quot;hf_xxx&quot;}\`

    Parameters
    ----------
    path : str
        Path to the file. Prefix with a protocol like \`hf://\` to read from Hugging Face.
        You can read from multiple files if you pass a globstring.
    columns : list, default None
        If not None, only these columns will be read from the file.
    filters : List[Tuple] or List[List[Tuple]], default None
        To filter out data.
        Filter syntax: [[(column, op, val), ...],...]
        where op is [==, =, &gt;, &gt;=, &lt;, &lt;=, !=, in, not in]
        The innermost tuples are transposed into a set of filters applied
        through an \`AND\` operation.
        The outer list combines these sets of filters through an \`OR\`
        operation.
        A single list of tuples can also be used, meaning that no \`OR\`
        operation between set of filters is to be conducted.

    **kwargs
        Any additional kwargs are passed to pyarrow.parquet.ParquetDataset.

    Returns
    -------
    DataFrame
        DataFrame based on parquet file.

    Examples
    --------
    &gt;&gt;&gt; path = &quot;hf://datasets/username/dataset/data.parquet&quot;
    &gt;&gt;&gt; pd.DataFrame({&quot;foo&quot;: range(5), &quot;bar&quot;: range(5, 10)}).to_parquet(path)
    &gt;&gt;&gt; read_parquet(path).show()
    +---+---+
    |foo|bar|
    +---+---+
    |  0|  5|
    |  1|  6|
    |  2|  7|
    |  3|  8|
    |  4|  9|
    +---+---+
    &gt;&gt;&gt; read_parquet(path, columns=[&quot;bar&quot;]).show()
    +---+
    |bar|
    +---+
    |  5|
    |  6|
    |  7|
    |  8|
    |  9|
    +---+
    &gt;&gt;&gt; sel = [(&quot;foo&quot;, &quot;&gt;&quot;, 2)]
    &gt;&gt;&gt; read_parquet(path, filters=sel).show()
    +---+---+
    |foo|bar|
    +---+---+
    |  3|  8|
    |  4|  9|
    +---+---+
    &quot;&quot;&quot;</span>
    filesystem: HfFileSystem = kwargs.pop(<span class="hljs-string">&quot;filesystem&quot;</span>) <span class="hljs-keyword">if</span> <span class="hljs-string">&quot;filesystem&quot;</span> <span class="hljs-keyword">in</span> kwargs <span class="hljs-keyword">else</span> HfFileSystem(**kwargs.pop(<span class="hljs-string">&quot;storage_options&quot;</span>, {}))
    paths = filesystem.glob(path)
    <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> paths:
        <span class="hljs-keyword">raise</span> FileNotFoundError(<span class="hljs-string">f&quot;Counldn&#x27;t find any file at <span class="hljs-subst">{path}</span>&quot;</span>)
    rdd = spark.sparkContext.parallelize([{<span class="hljs-string">&quot;path&quot;</span>: path} <span class="hljs-keyword">for</span> path <span class="hljs-keyword">in</span> paths], <span class="hljs-built_in">len</span>(paths))
    df = spark.createDataFrame(rdd)
    arrow_schema = pq.read_schema(filesystem.<span class="hljs-built_in">open</span>(paths[<span class="hljs-number">0</span>]))
    schema = pa.schema([field <span class="hljs-keyword">for</span> field <span class="hljs-keyword">in</span> arrow_schema <span class="hljs-keyword">if</span> (columns <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> field.name <span class="hljs-keyword">in</span> columns)], metadata=arrow_schema.metadata)
    <span class="hljs-keyword">return</span> df.mapInArrow(
        partial(_read, columns=columns, filters=filters, filesystem=filesystem, schema=arrow_schema, **kwargs),
        from_arrow_schema(schema),
    )`,wrap:!1}}),V=new U({props:{code:"ZnJvbSUyMHB5c3Bhcmsuc3FsJTIwaW1wb3J0JTIwU3BhcmtTZXNzaW9uJTBBc3BhcmslMjAlM0QlMjBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCUyMmRlbW8lMjIpLmdldE9yQ3JlYXRlKCklMEFkZiUyMCUzRCUyMHJlYWRfcGFycXVldCglMjJoZiUzQSUyRiUyRmRhdGFzZXRzJTJGQkFBSSUyRkluZmluaXR5LUluc3RydWN0JTJGN00lMkYqLnBhcnF1ZXQlMjIpJTBBZGYuc2hvdygp",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> pyspark.sql <span class="hljs-keyword">import</span> SparkSession
<span class="hljs-meta">&gt;&gt;&gt; </span>spark = SparkSession.builder.appName(<span class="hljs-string">&quot;demo&quot;</span>).getOrCreate()
<span class="hljs-meta">&gt;&gt;&gt; </span>df = read_parquet(<span class="hljs-string">&quot;hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>df.show()
+---+----------------------------+-----+----------+--------------------+        
| <span class="hljs-built_in">id</span>|               conversations|label|langdetect|              source|
+---+----------------------------+-----+----------+--------------------+
|  <span class="hljs-number">0</span>|        [{human, <span class="hljs-keyword">def</span> <span class="hljs-title function_">exti</span>...|     |        en|      code_exercises|
|  <span class="hljs-number">1</span>|        [{human, See the ...|     |        en|                flan|
|  <span class="hljs-number">2</span>|        [{human, This <span class="hljs-keyword">is</span> ...|     |        en|                flan|
|  <span class="hljs-number">3</span>|        [{human, If you d...|     |        en|                flan|
|  <span class="hljs-number">4</span>|        [{human, In a Uni...|     |        en|                flan|
|  <span class="hljs-number">5</span>|        [{human, Read the...|     |        en|                flan|
|  <span class="hljs-number">6</span>|        [{human, You are ...|     |        en|          code_bagel|
|  <span class="hljs-number">7</span>|        [{human, I want y...|     |        en|          Subjective|
|  <span class="hljs-number">8</span>|        [{human, Given th...|     |        en|                flan|
|  <span class="hljs-number">9</span>|[{human, 因果联系原则是法...|     |     zh-cn|          Subjective|
| <span class="hljs-number">10</span>|        [{human, Provide ...|     |        en|self-oss-instruct...|
| <span class="hljs-number">11</span>|        [{human, The univ...|     |        en|                flan|
| <span class="hljs-number">12</span>|        [{human, Q: I am ...|     |        en|                flan|
| <span class="hljs-number">13</span>|        [{human, What <span class="hljs-keyword">is</span> ...|     |        en|      OpenHermes-<span class="hljs-number">2.5</span>|
| <span class="hljs-number">14</span>|        [{human, In react...|     |        en|                flan|
| <span class="hljs-number">15</span>|        [{human, Write Py...|     |        en|      code_exercises|
| <span class="hljs-number">16</span>|        [{human, Find the...|     |        en|            MetaMath|
| <span class="hljs-number">17</span>|        [{human, Three of...|     |        en|            MetaMath|
| <span class="hljs-number">18</span>|        [{human, Chandra ...|     |        en|            MetaMath|
| <span class="hljs-number">19</span>|[{human, 用经济学知识分析...|     |     zh-cn|          Subjective|
+---+----------------------------+-----+----------+--------------------+`,wrap:!1}}),z=new U({props:{code:"ZGZfbGFuZ2RldGVjdF9vbmx5JTIwJTNEJTIwcmVhZF9wYXJxdWV0KCUyMmhmJTNBJTJGJTJGZGF0YXNldHMlMkZCQUFJJTJGSW5maW5pdHktSW5zdHJ1Y3QlMkY3TSUyRioucGFycXVldCUyMiUyQyUyMGNvbHVtbnMlM0QlNUIlMjJsYW5nZGV0ZWN0JTIyJTVEKSUwQWRmX2xhbmdkZXRlY3Rfb25seS5ncm91cEJ5KCUyMmxhbmdkZXRlY3QlMjIpLmNvdW50KCkuc2hvdygp",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>df_langdetect_only = read_parquet(<span class="hljs-string">&quot;hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet&quot;</span>, columns=[<span class="hljs-string">&quot;langdetect&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>df_langdetect_only.groupBy(<span class="hljs-string">&quot;langdetect&quot;</span>).count().show()
+----------+-------+                                                            
|langdetect|  count|
+----------+-------+
|        en|<span class="hljs-number">6697793</span>|
|     zh-cn| <span class="hljs-number">751313</span>|
+----------+-------+`,wrap:!1}}),q=new U({props:{code:"Y3JpdGVyaWElMjAlM0QlMjAlNUIoJTIybGFuZ2RldGVjdCUyMiUyQyUyMCUyMiUzRCUyMiUyQyUyMCUyMnpoLWNuJTIyKSU1RCUwQWRmX2NoaW5lc2Vfb25seSUyMCUzRCUyMHJlYWRfcGFycXVldCglMjJoZiUzQSUyRiUyRmRhdGFzZXRzJTJGQkFBSSUyRkluZmluaXR5LUluc3RydWN0JTJGN00lMkYqLnBhcnF1ZXQlMjIlMkMlMjBmaWx0ZXJzJTNEY3JpdGVyaWEpJTBBZGZfY2hpbmVzZV9vbmx5LnNob3coKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>criteria = [(<span class="hljs-string">&quot;langdetect&quot;</span>, <span class="hljs-string">&quot;=&quot;</span>, <span class="hljs-string">&quot;zh-cn&quot;</span>)]
<span class="hljs-meta">&gt;&gt;&gt; </span>df_chinese_only = read_parquet(<span class="hljs-string">&quot;hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet&quot;</span>, filters=criteria)
<span class="hljs-meta">&gt;&gt;&gt; </span>df_chinese_only.show()
+---+----------------------------+-----+----------+----------+                  
| <span class="hljs-built_in">id</span>|               conversations|label|langdetect|    source|
+---+----------------------------+-----+----------+----------+
|  <span class="hljs-number">9</span>|[{human, 因果联系原则是法...|     |     zh-cn|Subjective|
| <span class="hljs-number">19</span>|[{human, 用经济学知识分析...|     |     zh-cn|Subjective|
| <span class="hljs-number">38</span>| [{human, 某个考试共有A、...|     |     zh-cn|Subjective|
| <span class="hljs-number">39</span>|[{human, 撰写一篇关于斐波...|     |     zh-cn|Subjective|
| <span class="hljs-number">57</span>|[{human, 总结世界历史上的...|     |     zh-cn|Subjective|
| <span class="hljs-number">61</span>|[{human, 生成一则广告词。...|     |     zh-cn|Subjective|
| <span class="hljs-number">66</span>|[{human, 描述一个有效的团...|     |     zh-cn|Subjective|
| <span class="hljs-number">94</span>|[{human, 如果比利和蒂芙尼...|     |     zh-cn|Subjective|
|<span class="hljs-number">102</span>|[{human, 生成一句英文名言...|     |     zh-cn|Subjective|
|<span class="hljs-number">106</span>|[{human, 写一封感谢信，感...|     |     zh-cn|Subjective|
|<span class="hljs-number">118</span>| [{human, 生成一个故事。}...|     |     zh-cn|Subjective|
|<span class="hljs-number">174</span>|[{human, 高胆固醇水平的后...|     |     zh-cn|Subjective|
|<span class="hljs-number">180</span>|[{human, 基于以下角色信息...|     |     zh-cn|Subjective|
|<span class="hljs-number">192</span>|[{human, 请写一篇文章，概...|     |     zh-cn|Subjective|
|<span class="hljs-number">221</span>|[{human, 以诗歌形式表达对...|     |     zh-cn|Subjective|
|<span class="hljs-number">228</span>|[{human, 根据给定的指令，...|     |     zh-cn|Subjective|
|<span class="hljs-number">236</span>|[{human, 打开一个新的生成...|     |     zh-cn|Subjective|
|<span class="hljs-number">260</span>|[{human, 生成一个有关未来...|     |     zh-cn|Subjective|
|<span class="hljs-number">268</span>|[{human, 如果有一定数量的...|     |     zh-cn|Subjective|
|<span class="hljs-number">273</span>| [{human, 题目：小明有<span class="hljs-number">5</span>个...|     |     zh-cn|Subjective|
+---+----------------------------+-----+----------+----------+`,wrap:!1}}),H=new ps({props:{title:"Run SQL queries",local:"run-sql-queries",headingTag:"h3"}}),$=new U({props:{code:"ZnJvbSUyMHB5c3Bhcmsuc3FsJTIwaW1wb3J0JTIwU3BhcmtTZXNzaW9uJTBBc3BhcmslMjAlM0QlMjBTcGFya1Nlc3Npb24uYnVpbGRlci5hcHBOYW1lKCUyMmRlbW8lMjIpLmdldE9yQ3JlYXRlKCklMEFkZiUyMCUzRCUyMHJlYWRfcGFycXVldCglMjJoZiUzQSUyRiUyRmRhdGFzZXRzJTJGQkFBSSUyRkluZmluaXR5LUluc3RydWN0JTJGN00lMkYqLnBhcnF1ZXQlMjIlMkMlMjBjb2x1bW5zJTNEJTVCJTIyc291cmNlJTIyJTVEKSUwQXNwYXJrLnNxbCglMjJTRUxFQ1QlMjBzb3VyY2UlMkMlMjBjb3VudCgqKSUyMEFTJTIwdG90YWwlMjBGUk9NJTIwJTdCZGYlN0QlMjBHUk9VUCUyMEJZJTIwc291cmNlJTIwT1JERVIlMjBCWSUyMHRvdGFsJTIwREVTQyUyMiUyQyUyMGRmJTNEZGYpLnNob3coKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> pyspark.sql <span class="hljs-keyword">import</span> SparkSession
<span class="hljs-meta">&gt;&gt;&gt; </span>spark = SparkSession.builder.appName(<span class="hljs-string">&quot;demo&quot;</span>).getOrCreate()
<span class="hljs-meta">&gt;&gt;&gt; </span>df = read_parquet(<span class="hljs-string">&quot;hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet&quot;</span>, columns=[<span class="hljs-string">&quot;source&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>spark.sql(<span class="hljs-string">&quot;SELECT source, count(*) AS total FROM {df} GROUP BY source ORDER BY total DESC&quot;</span>, df=df).show()
+--------------------+-------+
|              source|  total|
+--------------------+-------+
|                flan|<span class="hljs-number">2435840</span>|
|          Subjective|<span class="hljs-number">1342427</span>|
|      OpenHermes-<span class="hljs-number">2.5</span>| <span class="hljs-number">855478</span>|
|            MetaMath| <span class="hljs-number">690138</span>|
|      code_exercises| <span class="hljs-number">590958</span>|
|Orca-math-word-pr...| <span class="hljs-number">398168</span>|
|          code_bagel| <span class="hljs-number">386649</span>|
|        MathInstruct| <span class="hljs-number">329254</span>|
|python-code-datas...|  <span class="hljs-number">88632</span>|
|instructional_cod...|  <span class="hljs-number">82920</span>|
|        CodeFeedback|  <span class="hljs-number">79513</span>|
|self-oss-instruct...|  <span class="hljs-number">50467</span>|
|Evol-Instruct-Cod...|  <span class="hljs-number">43354</span>|
|CodeExercise-Pyth...|  <span class="hljs-number">27159</span>|
|code_instructions...|  <span class="hljs-number">23130</span>|
|  Code-Instruct-700k|  <span class="hljs-number">10860</span>|
|Glaive-code-assis...|   <span class="hljs-number">9281</span>|
|python_code_instr...|   <span class="hljs-number">2581</span>|
|Python-Code-23k-S...|   <span class="hljs-number">2297</span>|
+--------------------+-------+`,wrap:!1}}),L=new ps({props:{title:"Write",local:"write",headingTag:"h2"}}),K=new U({props:{code:"",highlighted:`<span class="hljs-keyword">import</span> math
<span class="hljs-keyword">import</span> pickle
<span class="hljs-keyword">import</span> tempfile
<span class="hljs-keyword">from</span> functools <span class="hljs-keyword">import</span> partial
<span class="hljs-keyword">from</span> typing <span class="hljs-keyword">import</span> Iterator, <span class="hljs-type">Optional</span>

<span class="hljs-keyword">import</span> pyarrow <span class="hljs-keyword">as</span> pa
<span class="hljs-keyword">import</span> pyarrow.parquet <span class="hljs-keyword">as</span> pq
<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> CommitOperationAdd, HfFileSystem
<span class="hljs-keyword">from</span> pyspark.sql.dataframe <span class="hljs-keyword">import</span> DataFrame
<span class="hljs-keyword">from</span> pyspark.sql.pandas.types <span class="hljs-keyword">import</span> from_arrow_schema, to_arrow_schema


<span class="hljs-keyword">def</span> <span class="hljs-title function_">_preupload</span>(<span class="hljs-params">iterator: Iterator[pa.RecordBatch], path: <span class="hljs-built_in">str</span>, schema: pa.Schema, filesystem: HfFileSystem, row_group_size: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">int</span>] = <span class="hljs-literal">None</span>, **kwargs</span>) -&gt; Iterator[pa.RecordBatch]:
    resolved_path = filesystem.resolve_path(path)
    <span class="hljs-keyword">with</span> tempfile.NamedTemporaryFile(suffix=<span class="hljs-string">&quot;.parquet&quot;</span>) <span class="hljs-keyword">as</span> temp_file:
        <span class="hljs-keyword">with</span> pq.ParquetWriter(temp_file.name, schema=schema, **kwargs) <span class="hljs-keyword">as</span> writer:
            <span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> iterator:
                writer.write_batch(batch, row_group_size=row_group_size)
        addition = CommitOperationAdd(path_in_repo=temp_file.name, path_or_fileobj=temp_file.name)
        filesystem._api.preupload_lfs_files(repo_id=resolved_path.repo_id, additions=[addition], repo_type=resolved_path.repo_type, revision=resolved_path.revision)
    <span class="hljs-keyword">yield</span> pa.record_batch({<span class="hljs-string">&quot;addition&quot;</span>: [pickle.dumps(addition)]}, schema=pa.schema({<span class="hljs-string">&quot;addition&quot;</span>: pa.binary()}))


<span class="hljs-keyword">def</span> <span class="hljs-title function_">_commit</span>(<span class="hljs-params">iterator: Iterator[pa.RecordBatch], path: <span class="hljs-built_in">str</span>, filesystem: HfFileSystem, max_operations_per_commit=<span class="hljs-number">50</span></span>) -&gt; Iterator[pa.RecordBatch]:
    resolved_path = filesystem.resolve_path(path)
    additions: <span class="hljs-built_in">list</span>[CommitOperationAdd] = [pickle.loads(addition) <span class="hljs-keyword">for</span> addition <span class="hljs-keyword">in</span> pa.Table.from_batches(iterator, schema=pa.schema({<span class="hljs-string">&quot;addition&quot;</span>: pa.binary()}))[<span class="hljs-number">0</span>].to_pylist()]
    num_commits = math.ceil(<span class="hljs-built_in">len</span>(additions) / max_operations_per_commit)
    <span class="hljs-keyword">for</span> shard_idx, addition <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(additions):
        addition.path_in_repo = resolved_path.path_in_repo.replace(<span class="hljs-string">&quot;{shard_idx:05d}&quot;</span>, <span class="hljs-string">f&quot;<span class="hljs-subst">{shard_idx:05d}</span>&quot;</span>)
    <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, num_commits):
        operations = additions[i * max_operations_per_commit : (i + <span class="hljs-number">1</span>) * max_operations_per_commit]
        commit_message = <span class="hljs-string">&quot;Upload using PySpark&quot;</span> + (<span class="hljs-string">f&quot; (part <span class="hljs-subst">{i:05d}</span>-of-<span class="hljs-subst">{num_commits:05d}</span>)&quot;</span> <span class="hljs-keyword">if</span> num_commits &gt; <span class="hljs-number">1</span> <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;&quot;</span>)
        filesystem._api.create_commit(repo_id=resolved_path.repo_id, repo_type=resolved_path.repo_type, revision=resolved_path.revision, operations=operations, commit_message=commit_message)
        <span class="hljs-keyword">yield</span> pa.record_batch({<span class="hljs-string">&quot;path&quot;</span>: [addition.path_in_repo <span class="hljs-keyword">for</span> addition <span class="hljs-keyword">in</span> operations]}, schema=pa.schema({<span class="hljs-string">&quot;path&quot;</span>: pa.string()}))


<span class="hljs-keyword">def</span> <span class="hljs-title function_">write_parquet</span>(<span class="hljs-params">df: DataFrame, path: <span class="hljs-built_in">str</span>, **kwargs</span>) -&gt; <span class="hljs-literal">None</span>:
    <span class="hljs-string">&quot;&quot;&quot;
    Write Parquet files to Hugging Face using PyArrow.

    It uploads Parquet files in a distributed manner in two steps:

    1. Preupload the Parquet files in parallel in a distributed manner
    2. Commit the preuploaded files

    Authenticate using \`huggingface-cli login\` or passing a token
    using the \`storage_options\` argument: \`storage_options={&quot;token&quot;: &quot;hf_xxx&quot;}\`

    Parameters
    ----------
    path : str
        Path of the file or directory. Prefix with a protocol like \`hf://\` to write to Hugging Face.
        It writes Parquet files in the form &quot;part-xxxxx.parquet&quot;, or to a single file if \`path\` ends with &quot;.parquet&quot; or &quot;.pq&quot;.

    **kwargs
        Any additional kwargs are passed to pyarrow.parquet.ParquetWriter.

    Returns
    -------
    None
        The DataFrame is uploaded to Hugging Face as Parquet; nothing is returned.

    Examples
    --------
    &gt;&gt;&gt; df = spark.createDataFrame(pd.DataFrame({&quot;foo&quot;: range(5), &quot;bar&quot;: range(5, 10)}))
    &gt;&gt;&gt; # Save to one file
    &gt;&gt;&gt; write_parquet(df, &quot;hf://datasets/username/dataset/data.parquet&quot;)
    &gt;&gt;&gt; # OR save to a directory (possibly in many files)
    &gt;&gt;&gt; write_parquet(df, &quot;hf://datasets/username/dataset&quot;)
    &quot;&quot;&quot;</span>
    filesystem: HfFileSystem = kwargs.pop(<span class="hljs-string">&quot;filesystem&quot;</span>, HfFileSystem(**kwargs.pop(<span class="hljs-string">&quot;storage_options&quot;</span>, {})))
    <span class="hljs-keyword">if</span> path.endswith(<span class="hljs-string">&quot;.parquet&quot;</span>) <span class="hljs-keyword">or</span> path.endswith(<span class="hljs-string">&quot;.pq&quot;</span>):
        df = df.coalesce(<span class="hljs-number">1</span>)
    <span class="hljs-keyword">else</span>:
        path += <span class="hljs-string">&quot;/part-{shard_idx:05d}.parquet&quot;</span>
    df.mapInArrow(
        partial(_preupload, path=path, schema=to_arrow_schema(df.schema), filesystem=filesystem, **kwargs),
        from_arrow_schema(pa.schema({<span class="hljs-string">&quot;addition&quot;</span>: pa.binary()})),
    ).repartition(<span class="hljs-number">1</span>).mapInArrow(
        partial(_commit, path=path, filesystem=filesystem),
        from_arrow_schema(pa.schema({<span class="hljs-string">&quot;path&quot;</span>: pa.string()})),
    ).collect()`,wrap:!1}}),ls=new U({props:{code:"d3JpdGVfcGFycXVldChkZl9jaGluZXNlX29ubHklMkMlMjAlMjJoZiUzQSUyRiUyRmRhdGFzZXRzJTJGdXNlcm5hbWUlMkZJbmZpbml0eS1JbnN0cnVjdC1DaGluZXNlLU9ubHklMjIp",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>write_parquet(df_chinese_only, <span class="hljs-string">&quot;hf://datasets/username/Infinity-Instruct-Chinese-Only&quot;</span>)
tmph9jwu9py.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.5</span>M/<span class="hljs-number">50.5</span>M [<span class="hljs-number">00</span>:03&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">14.6</span>MB/s]
tmp0oqt99nc.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.8</span>M/<span class="hljs-number">50.8</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">17.9</span>MB/s]
tmpgnizkwqp.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.5</span>M/<span class="hljs-number">50.5</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">19.6</span>MB/s]
tmpanm04k4n.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">51.4</span>M/<span class="hljs-number">51.4</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">22.9</span>MB/s]
tmp14uy9oqb.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.4</span>M/<span class="hljs-number">50.4</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">23.0</span>MB/s]
tmpcp8t_qdl.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.4</span>M/<span class="hljs-number">50.4</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">23.5</span>MB/s]
tmpjui5mns8.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.3</span>M/<span class="hljs-number">50.3</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">24.1</span>MB/s]
tmpydqh6od1.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.9</span>M/<span class="hljs-number">50.9</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">23.8</span>MB/s]
tmp52f2t8tu.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.5</span>M/<span class="hljs-number">50.5</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">23.7</span>MB/s]
tmpg7egv3ye.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.1</span>M/<span class="hljs-number">50.1</span>M [<span class="hljs-number">00</span>:06&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">7.68</span>MB/s]
tmp2s0fq2hm.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">50.8</span>M/<span class="hljs-number">50.8</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">18.1</span>MB/s]
tmpmj97ab30.parquet: <span class="hljs-number">100</span>%|██████████| <span class="hljs-number">71.3</span>M/<span class="hljs-number">71.3</span>M [<span class="hljs-number">00</span>:02&lt;<span class="hljs-number">00</span>:<span class="hljs-number">00</span>, <span class="hljs-number">23.9</span>MB/s]`,wrap:!1}}),as=new ps({props:{title:"Run in JupyterLab on Hugging Face Spaces",local:"run-in-jupyterlab-on-hugging-face-spaces",headingTag:"h2"}}),ts=new Al({props:{source:"https://github.com/huggingface/hub-docs/blob/main/docs/hub/datasets-spark.md"}}),{c(){m=p("meta"),rs=e(),ys=p("p"),Js=e(),c(w.$$.fragment),hs=e(),T=p("p"),T.textContent=tl,os=e(),d=p("p"),d.innerHTML=pl,Us=e(),c(b.$$.fragment),ms=e(),I=p("p"),I.innerHTML=Ml,us=e(),c(C.$$.fragment),js=e(),f=p("p"),f.innerHTML=yl,ws=e(),c(B.$$.fragment),Ts=e(),c(G.$$.fragment),ds=e(),g=p("p"),g.textContent=cl,bs=e(),Z=p("p"),Z.textContent=il,Is=e(),c(k.$$.fragment),Cs=e(),S=p("p"),S.innerHTML=rl,fs=e(),c(R.$$.fragment),Bs=e(),W=p("p"),W.innerHTML=Jl,Gs=e(),c(X.$$.fragment),gs=e(),Q=p("p"),Q.textContent=hl,Zs=e(),N=p("p"),N.innerHTML=ol,ks=e(),c(Y.$$.fragment),Ss=e(),F=p("p"),F.innerHTML=Ul,Rs=e(),u=p("div"),u.innerHTML=ml,Ws=e(),A=p("p"),A.innerHTML=ul,Xs=e(),E=p("p"),E.textContent=jl,Qs=e(),c(V.$$.fragment),Ns=e(),v=p("p"),v.innerHTML=wl,Ys=e(),c(z.$$.fragment),Fs=e(),_=p("p"),_.textContent=Tl,As=e(),c(q.$$.fragment),Es=e(),c(H.$$.fragment),Vs=e(),x=p("p"),x.innerHTML=dl,vs=e(),c($.$$.fragment),zs=e(),c(L.$$.fragment),_s=e(),D=p("p"),D.textContent=bl,qs=e(),P=p("p"),P.innerHTML=Il,Hs=e(),c(K.$$.fragment),xs=e(),O=p("p"),O.innerHTML=Cl,$s=e(),ss=p("p"),ss.innerHTML=fl,Ls=e(),c(ls.$$.fragment),Ds=e(),j=p("div"),j.innerHTML=Bl,Ps=e(),c(as.$$.fragment),Ks=e(),ns=p("p"),ns.innerHTML=Gl,Os=e(),es=p("p"),es.textContent=gl,sl=e(),Ms=p("img"),ll=e(),c(ts.$$.fragment),al=e(),cs=p("p"),this.h()},l(s){const 
l=Yl("svelte-u9bgzb",document.head);m=M(l,"META",{name:!0,content:!0}),l.forEach(a),rs=t(s),ys=M(s,"P",{}),kl(ys).forEach(a),Js=t(s),i(w.$$.fragment,s),hs=t(s),T=M(s,"P",{"data-svelte-h":!0}),y(T)!=="svelte-s0r53o"&&(T.textContent=tl),os=t(s),d=M(s,"P",{"data-svelte-h":!0}),y(d)!=="svelte-12z8vci"&&(d.innerHTML=pl),Us=t(s),i(b.$$.fragment,s),ms=t(s),I=M(s,"P",{"data-svelte-h":!0}),y(I)!=="svelte-13h0lcm"&&(I.innerHTML=Ml),us=t(s),i(C.$$.fragment,s),js=t(s),f=M(s,"P",{"data-svelte-h":!0}),y(f)!=="svelte-ml7hwl"&&(f.innerHTML=yl),ws=t(s),i(B.$$.fragment,s),Ts=t(s),i(G.$$.fragment,s),ds=t(s),g=M(s,"P",{"data-svelte-h":!0}),y(g)!=="svelte-aqw7kt"&&(g.textContent=cl),bs=t(s),Z=M(s,"P",{"data-svelte-h":!0}),y(Z)!=="svelte-pdivv8"&&(Z.textContent=il),Is=t(s),i(k.$$.fragment,s),Cs=t(s),S=M(s,"P",{"data-svelte-h":!0}),y(S)!=="svelte-1pzllua"&&(S.innerHTML=rl),fs=t(s),i(R.$$.fragment,s),Bs=t(s),W=M(s,"P",{"data-svelte-h":!0}),y(W)!=="svelte-ygg3x4"&&(W.innerHTML=Jl),Gs=t(s),i(X.$$.fragment,s),gs=t(s),Q=M(s,"P",{"data-svelte-h":!0}),y(Q)!=="svelte-7gpw7a"&&(Q.textContent=hl),Zs=t(s),N=M(s,"P",{"data-svelte-h":!0}),y(N)!=="svelte-4fzrqm"&&(N.innerHTML=ol),ks=t(s),i(Y.$$.fragment,s),Ss=t(s),F=M(s,"P",{"data-svelte-h":!0}),y(F)!=="svelte-1vx9tic"&&(F.innerHTML=Ul),Rs=t(s),u=M(s,"DIV",{class:!0,"data-svelte-h":!0}),y(u)!=="svelte-1ct11n9"&&(u.innerHTML=ml),Ws=t(s),A=M(s,"P",{"data-svelte-h":!0}),y(A)!=="svelte-1p4gwvr"&&(A.innerHTML=ul),Xs=t(s),E=M(s,"P",{"data-svelte-h":!0}),y(E)!=="svelte-19d7o0j"&&(E.textContent=jl),Qs=t(s),i(V.$$.fragment,s),Ns=t(s),v=M(s,"P",{"data-svelte-h":!0}),y(v)!=="svelte-q1u30o"&&(v.innerHTML=wl),Ys=t(s),i(z.$$.fragment,s),Fs=t(s),_=M(s,"P",{"data-svelte-h":!0}),y(_)!=="svelte-e2gcj5"&&(_.textContent=Tl),As=t(s),i(q.$$.fragment,s),Es=t(s),i(H.$$.fragment,s),Vs=t(s),x=M(s,"P",{"data-svelte-h":!0}),y(x)!=="svelte-13i529n"&&(x.innerHTML=dl),vs=t(s),i($.$$.fragment,s),zs=t(s),i(L.$$.fragment,s),_s=t(s),D=M(s,"P",{"data-svelte-h":!0}),y(D)!=="svelte-nrb4bc"
&&(D.textContent=bl),qs=t(s),P=M(s,"P",{"data-svelte-h":!0}),y(P)!=="svelte-104rxy3"&&(P.innerHTML=Il),Hs=t(s),i(K.$$.fragment,s),xs=t(s),O=M(s,"P",{"data-svelte-h":!0}),y(O)!=="svelte-1d35p3o"&&(O.innerHTML=Cl),$s=t(s),ss=M(s,"P",{"data-svelte-h":!0}),y(ss)!=="svelte-wkzo7"&&(ss.innerHTML=fl),Ls=t(s),i(ls.$$.fragment,s),Ds=t(s),j=M(s,"DIV",{class:!0,"data-svelte-h":!0}),y(j)!=="svelte-tmflad"&&(j.innerHTML=Bl),Ps=t(s),i(as.$$.fragment,s),Ks=t(s),ns=M(s,"P",{"data-svelte-h":!0}),y(ns)!=="svelte-172t2p0"&&(ns.innerHTML=Gl),Os=t(s),es=M(s,"P",{"data-svelte-h":!0}),y(es)!=="svelte-1rabvow"&&(es.textContent=gl),sl=t(s),Ms=M(s,"IMG",{src:!0}),ll=t(s),i(ts.$$.fragment,s),al=t(s),cs=M(s,"P",{}),kl(cs).forEach(a),this.h()},h(){is(m,"name","hf:doc:metadata"),is(m,"content",Vl),is(u,"class","flex justify-center"),is(j,"class","flex justify-center"),Rl(Ms.src,Zl="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/spark-on-hf-jupyterlab-screenshot-min.png")||is(Ms,"src",Zl)},m(s,l){Fl(document.head,m),n(s,rs,l),n(s,ys,l),n(s,Js,l),r(w,s,l),n(s,hs,l),n(s,T,l),n(s,os,l),n(s,d,l),n(s,Us,l),r(b,s,l),n(s,ms,l),n(s,I,l),n(s,us,l),r(C,s,l),n(s,js,l),n(s,f,l),n(s,ws,l),r(B,s,l),n(s,Ts,l),r(G,s,l),n(s,ds,l),n(s,g,l),n(s,bs,l),n(s,Z,l),n(s,Is,l),r(k,s,l),n(s,Cs,l),n(s,S,l),n(s,fs,l),r(R,s,l),n(s,Bs,l),n(s,W,l),n(s,Gs,l),r(X,s,l),n(s,gs,l),n(s,Q,l),n(s,Zs,l),n(s,N,l),n(s,ks,l),r(Y,s,l),n(s,Ss,l),n(s,F,l),n(s,Rs,l),n(s,u,l),n(s,Ws,l),n(s,A,l),n(s,Xs,l),n(s,E,l),n(s,Qs,l),r(V,s,l),n(s,Ns,l),n(s,v,l),n(s,Ys,l),r(z,s,l),n(s,Fs,l),n(s,_,l),n(s,As,l),r(q,s,l),n(s,Es,l),r(H,s,l),n(s,Vs,l),n(s,x,l),n(s,vs,l),r($,s,l),n(s,zs,l),r(L,s,l),n(s,_s,l),n(s,D,l),n(s,qs,l),n(s,P,l),n(s,Hs,l),r(K,s,l),n(s,xs,l),n(s,O,l),n(s,$s,l),n(s,ss,l),n(s,Ls,l),r(ls,s,l),n(s,Ds,l),n(s,j,l),n(s,Ps,l),r(as,s,l),n(s,Ks,l),n(s,ns,l),n(s,Os,l),n(s,es,l),n(s,sl,l),n(s,Ms,l),n(s,ll,l),r(ts,s,l),n(s,al,l),n(s,cs,l),nl=!0},p:Wl,i(s){nl||(J(w.$$.fragment,s),J(b.$$.fragment,s),J(C.$$.fragment,s),J(
B.$$.fragment,s),J(G.$$.fragment,s),J(k.$$.fragment,s),J(R.$$.fragment,s),J(X.$$.fragment,s),J(Y.$$.fragment,s),J(V.$$.fragment,s),J(z.$$.fragment,s),J(q.$$.fragment,s),J(H.$$.fragment,s),J($.$$.fragment,s),J(L.$$.fragment,s),J(K.$$.fragment,s),J(ls.$$.fragment,s),J(as.$$.fragment,s),J(ts.$$.fragment,s),nl=!0)},o(s){h(w.$$.fragment,s),h(b.$$.fragment,s),h(C.$$.fragment,s),h(B.$$.fragment,s),h(G.$$.fragment,s),h(k.$$.fragment,s),h(R.$$.fragment,s),h(X.$$.fragment,s),h(Y.$$.fragment,s),h(V.$$.fragment,s),h(z.$$.fragment,s),h(q.$$.fragment,s),h(H.$$.fragment,s),h($.$$.fragment,s),h(L.$$.fragment,s),h(K.$$.fragment,s),h(ls.$$.fragment,s),h(as.$$.fragment,s),h(ts.$$.fragment,s),nl=!1},d(s){s&&(a(rs),a(ys),a(Js),a(hs),a(T),a(os),a(d),a(Us),a(ms),a(I),a(us),a(js),a(f),a(ws),a(Ts),a(ds),a(g),a(bs),a(Z),a(Is),a(Cs),a(S),a(fs),a(Bs),a(W),a(Gs),a(gs),a(Q),a(Zs),a(N),a(ks),a(Ss),a(F),a(Rs),a(u),a(Ws),a(A),a(Xs),a(E),a(Qs),a(Ns),a(v),a(Ys),a(Fs),a(_),a(As),a(Es),a(Vs),a(x),a(vs),a(zs),a(_s),a(D),a(qs),a(P),a(Hs),a(xs),a(O),a($s),a(ss),a(Ls),a(Ds),a(j),a(Ps),a(Ks),a(ns),a(Os),a(es),a(sl),a(Ms),a(ll),a(al),a(cs)),a(m),o(w,s),o(b,s),o(C,s),o(B,s),o(G,s),o(k,s),o(R,s),o(X,s),o(Y,s),o(V,s),o(z,s),o(q,s),o(H,s),o($,s),o(L,s),o(K,s),o(ls,s),o(as,s),o(ts,s)}}}const Vl='{"title":"Spark","local":"spark","sections":[{"title":"Installation","local":"installation","sections":[],"depth":2},{"title":"Authentication","local":"authentication","sections":[],"depth":2},{"title":"Read","local":"read","sections":[{"title":"Run SQL queries","local":"run-sql-queries","sections":[],"depth":3}],"depth":2},{"title":"Write","local":"write","sections":[],"depth":2},{"title":"Run in JupyterLab on Hugging Face Spaces","local":"run-in-jupyterlab-on-hugging-face-spaces","sections":[],"depth":2}],"depth":1}';function vl(el){return Xl(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class xl extends Ql{constructor(m){super(),Nl(this,m,vl,El,Sl,{})}}export{xl as component};
