Strategies to Efficiently Work with NEOWISE Single-exposure Source Table in Parquet¶
This notebook discusses strategies for working with the Apache Parquet version of the NEOWISE Single-exposure Source Table and provides the basic code needed for each approach. This is a very large catalog -- 10 years and 40 terabytes in total, with 145 columns and 200 billion rows. Most of this notebook focuses on how to deal efficiently with so much data.
Learning Goals:
- Identify use cases that will benefit most from using this Parquet format.
- Understand how this dataset is organized and three different ways to efficiently slice it in order to obtain a subset small enough to load into memory and process.
- Feel prepared to apply these methods to a new use case and determine efficient filtering and slicing options.
1. Introduction¶
The NEOWISE Single-exposure Source Table comprises 10 years of data. Each year on its own would be considered "large" compared to astronomy catalogs produced contemporaneously, so working with the full dataset requires extra consideration. In this Parquet version, each year is stored as an independent Parquet dataset. This tutorial shows how to combine them and work with all years as one. The data are partitioned by HEALPix (Górski et al., 2005) order k=5. HEALPix is a tessellation of the sky, so partitioning the dataset this way makes it especially efficient for spatial queries. In addition, the access methods shown below are expected to perform well when parallelized.
The terms "partition", "filter", and "slice" all relate to subsets of data. In this notebook we'll use them with definitions that overlap but are not identical, as follows:
- A "partition" includes all data observed in a single HEALPix pixel. This data is grouped together in the Parquet files.
- A "filter" is a set of criteria defined by the user and applied when reading data so that only the desired rows are loaded. We'll use the term exclusively to refer to a PyArrow filter. The criteria can include partitions and/or any other column in the dataset.
- A "slice" is a generic subset of data. There are a few ways to obtain a slice; one is by applying a filter.
This notebook is expected to require about 2 CPUs and 50G RAM and to complete in about 10 minutes. These estimates are based on testing in science platform environments. Your numbers will vary based on many factors including compute power, bandwidth, and physical distance from the data. The required RAM and runtime can be reduced by limiting the number of NEOWISE years loaded.
1.1 When to use the Parquet version¶
IRSA provides large catalogs in file formats (e.g., Parquet) primarily to support use cases that require bulk access. The Parquet version of the NEOWISE Source Table, coupled with the methods demonstrated in this tutorial, is expected to be the most efficient option for large-ish use cases like classifying, clustering, or building light curves for many objects. In general, the efficiency will increase with the number of rows required from each slice because the slice only needs to be searched once regardless of the number of rows that are actually loaded. In addition, this access route tends to perform well when parallelized. Note that these use cases (including this notebook) are often too large for a laptop and may perform poorly and/or crash if attempted on one.
For small-ish use cases like searching for a handful of objects, other access routes like PyVO and TAP queries will be faster.
Consider using this tutorial if either of the following is true:
- Your use case is large enough that you are considering parallelizing your code to speed it up.
- Your sample size is large enough that loading the data using a different method is likely to take hours, days, or longer.
1.2 Recommended approach¶
The basic process is:
1. Load the catalog metadata as a PyArrow dataset.
2. Decide how to slice the dataset (e.g., by year, partition, and/or file) depending on your use case.
3. Iterate and/or parallelize over the slices. For each slice:
   A. Use the PyArrow dataset to load data of interest, applying row filters during the read.
   B. Process the data as you like (e.g., cluster, classify, etc.).
   C. Write your results to disk and/or return them for further processing.
4. (Optional) Concatenate your results and continue processing.
This notebook covers steps 1 through 3.A and indicates where to insert your own code to proceed with 3.B. Here we iterate over slices, but the same code can be parallelized using any multi-processing framework. A fully-worked example is shown in the light curve notebook linked below.
2. Imports¶
# Uncomment the next line to install dependencies if needed.
# !pip install hpgeom pandas pyarrow
import re # parse strings
import sys # check size of loaded data
import hpgeom # HEALPix math
import pandas as pd # store and manipulate table data
import pyarrow.compute # construct dataset filters
import pyarrow.dataset # load and query the NEOWISE dataset
import pyarrow.fs # interact with the S3 bucket storing the NEOWISE catalog
3. Setup¶
Choose which NEOWISE years to include. Expect the notebook to require about 4G RAM and 1 minute of runtime per year.
# All NEOWISE years => about 40G RAM and 10 minutes runtime
YEARS = list(range(1, 11))
# To reduce the needed RAM or runtime, uncomment the next line and choose your own years.
# Years 1 and 9 are needed for the median_file and biggest_file (defined below).
# YEARS = [1, 9]
Column and partition variables:
# subset of columns to load
flux_columns = ["w1flux", "w1sigflux", "w2flux", "w2sigflux"]
COLUMN_SUBSET = ["cntr", "source_id", "ra", "dec"] + flux_columns
# partitioning info. do not change these values.
K = 5 # healpix order of the catalog partitioning
KCOLUMN = "healpix_k5" # partitioning column name
KFIELD = pyarrow.compute.field(KCOLUMN) # pyarrow compute field, to be used in filters
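As a quick preview of how these variables will be used, here is a minimal sketch of a PyArrow filter that selects one partition and adds a row-level criterion comparing two columns. The pixel index and the signal-to-noise threshold are arbitrary illustration values, and the arithmetic on expressions assumes a recent pyarrow version. Nothing is read here; the expression is only evaluated when passed to a read call such as to_table (shown in Section 4).
# Example filter (for illustration only, not used elsewhere in this notebook):
# one partition plus a rough W1 signal-to-noise > 5 cut.
example_filter = (KFIELD == 1124) & (
    pyarrow.compute.field("w1flux") / pyarrow.compute.field("w1sigflux") > 5
)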
Paths:
# We're going to look at several different files, so make a function to return the path.
def neowise_path(year, file="_metadata"):
"""Return the path to a file. Default is "_metadata" file of the given year's dataset.
Parameters
----------
year : int
NEOWISE year for which the path is being generated.
file : str
The name of the file for which to return the path.
Returns
-------
str
The path to the file.
"""
# This information can be found at https://irsa.ipac.caltech.edu/cloud_access/.
bucket = "nasa-irsa-wise"
base_prefix = "wise/neowiser/catalogs/p1bs_psd/healpix_k5"
root_dir = f"{bucket}/{base_prefix}/year{year}/neowiser-healpix_k5-year{year}.parquet"
return f"{root_dir}/{file}"
Some representative partitions and files (see dataset stats in the Appendix for how we determine these values):
# pixel index of the median partition and the biggest partition by number of rows
median_part = 10_936
biggest_part = 8_277
# path to the median file and the biggest file by file size on disk (see Appendix)
median_file = neowise_path(9, "healpix_k0=3/healpix_k5=3420/part0.snappy.parquet")
biggest_file = neowise_path(1, "healpix_k0=2/healpix_k5=2551/part0.snappy.parquet")
Convenience function for displaying a table size:
# We'll use this function throughout the notebook to see how big different tables are.
def print_table_size(table, pixel_index=None):
"""Prints the shape (rows x columns) and size (GiB) of the given table.
Parameters
----------
table : pyarrow.Table
The table for which to print the size.
pixel_index : int or str or None
The pixel index corresponding to the partition this table was loaded from.
"""
if pixel_index is not None:
print(f"pixel index: {pixel_index}")
print(f"table shape: {table.num_rows:,} rows x {table.num_columns} columns")
print(f"table size: {sys.getsizeof(table) / 1024**3:.2f} GiB")
3.2 Load NEOWISE metadata as a pyarrow dataset¶
The metadata contains column names, schema, and row-group statistics for every file in the dataset. Later, we will use this pyarrow dataset object to slice and query the catalog in several different ways.
# This catalog is so big that even the metadata is big.
# Expect this cell to take about 30 seconds per year.
fs = pyarrow.fs.S3FileSystem(region="us-west-2", anonymous=True)
# list of datasets, one per year
year_datasets = [
pyarrow.dataset.parquet_dataset(neowise_path(yr), filesystem=fs, partitioning="hive")
for yr in YEARS
]
# unified dataset, all years
neowise_ds = pyarrow.dataset.dataset(year_datasets)
neowise_ds is a UnionDataset. All methods demonstrated for pyarrow datasets in the AllWISE demo notebook can be used with neowise_ds and will be applied to all years as if they were a single dataset. In addition, a separate Dataset for each year is stored in the list attribute neowise_ds.children (== year_datasets, loaded above), and the same methods can be applied to them individually.
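For example, the unified dataset can be used to inspect the schema or to count the rows in a partition without loading the table data. This is a minimal sketch; count_rows only needs the metadata and the partitioning column here, but it still visits many fragments, so expect it to take a little while.
# Inspect one column's type and the total number of columns.
print(neowise_ds.schema.field("w1flux"))
print(f"{len(neowise_ds.schema.names)} columns")
# Count rows in a single partition across all loaded years without loading the data itself.
nrows = neowise_ds.count_rows(filter=(KFIELD == median_part))
print(f"{nrows:,} rows in partition {median_part}")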
4. Example: Slice by partition¶
This example shows how to load data from each partition separately.
The actual "slicing" is done by applying a filter to the pyarrow dataset neowise_ds
.
Constructing filters was discussed in the AllWISE notebook linked above.
# number of order K pixels covering the full sky
npixels = hpgeom.nside_to_npixel(hpgeom.order_to_nside(order=K))
# iterate over all partitions
for pix in range(npixels):
# slice and load to get all rows in this partition, subset of columns
pixel_tbl = neowise_ds.to_table(filter=(KFIELD == pix), columns=COLUMN_SUBSET)
# insert your code here to continue processing
# we'll just print the table size to get a sense of how much data has been loaded
print_table_size(table=pixel_tbl, pixel_index=pix)
# when done, you may want to delete pixel_tbl to free the memory
del pixel_tbl
# we'll stop after one partition
break
pixel index: 0
table shape: 8,168,397 rows x 8 columns
table size: 0.58 GiB
pixel_tbl is a (pyarrow) Table containing all NEOWISE sources with an ra/dec falling within HEALPix order 5 pixel pix. Use pixel_tbl.to_pandas() to convert the table to a pandas dataframe.
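For example, a minimal sketch of the pandas conversion (re-loading partition 0 here, since pixel_tbl was deleted in the loop above; the describe call is just a placeholder for your own analysis):
# Re-load one partition and convert it to a pandas dataframe.
pixel_df = neowise_ds.to_table(filter=(KFIELD == 0), columns=COLUMN_SUBSET).to_pandas()
# Work with the dataframe as usual, e.g., summary statistics of the flux columns.
print(pixel_df[flux_columns].describe())
del pixel_df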
How big are the partitions? (see Appendix for details)
# median partition
median_part_tbl = neowise_ds.to_table(
filter=(KFIELD == median_part), columns=COLUMN_SUBSET
)
print_table_size(table=median_part_tbl, pixel_index=median_part)
pixel index: 10936
table shape: 12,436,184 rows x 8 columns
table size: 0.89 GiB
Often only a few columns are needed for processing, so most partitions will fit comfortably in memory. (The recommended maximum for an in-memory table/dataframe is typically 1GB, but there is no strict upper limit -- performance will depend on the compute resources available.)
However, beware that the largest partitions are quite large:
# biggest partition
# this is very large, so we'll restrict the number of columns to one
biggest_part_tbl = neowise_ds.to_table(
filter=(KFIELD == biggest_part), columns=COLUMN_SUBSET[:1]
)
print_table_size(table=biggest_part_tbl, pixel_index=biggest_part)
# Additional filters can be included to reduce the number of rows if desired.
# Another option is to load individual files.
pixel index: 8277
table shape: 499,017,376 rows x 1 columns
table size: 3.77 GiB
# cleanup
del median_part_tbl
del biggest_part_tbl
5. Example: Slice by file¶
If you don't need data for all years at the same time, you may want to load individual files. Generally, there is 1 file per partition per year, but a few partitions are as large as 6+ files per year. Most of the files are about 0.3GB (compressed on disk) but about 1% are > 1GB. Thus it should be reasonable to load at least a subset of columns for every row in a file.
The actual "slicing" here is done by using neowise_ds
to access a
dataset Fragment
(frag
in the code below) which represents a single file.
# slice by file and iterate
for frag in neowise_ds.get_fragments():
# load the slice to get every row in the file, subset of columns
file_tbl = frag.to_table(columns=COLUMN_SUBSET)
# insert your code here to continue processing the file as desired
# if you need to see which file this is, parse the path
print(f"file path: {frag.path}")
# let's see how much data this loaded
print_table_size(table=file_tbl)
# again, we'll stop after one
del file_tbl
break
file path: nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd/healpix_k5/year1/neowiser-healpix_k5-year1.parquet/healpix_k0=0/healpix_k5=0/part0.snappy.parquet
table shape: 732,246 rows x 8 columns
table size: 0.05 GiB
This can be combined with the previous example to iterate over the files in a single partition (left as an exercise for the reader).
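As a starting point for that exercise, here is a minimal sketch that passes a partition filter to get_fragments so that only the files belonging to one partition are visited. This assumes get_fragments accepts a filter on the partitioning column, which it does in recent pyarrow versions.
# Iterate over only the files in one partition (roughly one file per loaded year).
for frag in neowise_ds.get_fragments(filter=(KFIELD == median_part)):
    file_tbl = frag.to_table(columns=COLUMN_SUBSET)
    # insert your code here to process the file as desired
    print(f"{frag.path}: {file_tbl.num_rows:,} rows")
    del file_tbl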
How big are the files?
# median file
median_file_frag = [
frag for frag in neowise_ds.get_fragments() if frag.path == median_file
][0]
median_file_tbl = median_file_frag.to_table(columns=COLUMN_SUBSET)
print_table_size(table=median_file_tbl)
table shape: 1,254,763 rows x 8 columns
table size: 0.09 GiB
# biggest file
biggest_file_frag = [
frag for frag in neowise_ds.get_fragments() if frag.path == biggest_file
][0]
biggest_file_tbl = biggest_file_frag.to_table(columns=COLUMN_SUBSET)
print_table_size(table=biggest_file_tbl)
table shape: 6,000,000 rows x 8 columns
table size: 0.43 GiB
# cleanup
del median_file_tbl
del biggest_file_tbl
6. Example: Slice by year¶
If you want to handle the years independently, you can work with the per-year datasets. We actually created these "slices" in the Setup section with year_datasets, and that same list is now accessible in neowise_ds.children, used below. Any of the techniques shown in this notebook or those listed under "See also" can also be applied to the per-year datasets.
# slice by year and iterate. zip with YEARS so that we know which slice this is.
for year, year_ds in zip(YEARS, neowise_ds.children):
# insert your code here to process year_ds as desired.
# filter and load, iterate over partitions or files, etc.
# we'll just look at some basic metadata.
num_rows = sum(frag.metadata.num_rows for frag in year_ds.get_fragments())
num_files = len(year_ds.files)
print(f"NEOWISE year {year} dataset: {num_rows:,} rows in {num_files:,} files")
NEOWISE year 1 dataset: 18,468,575,586 rows in 12,441 files
NEOWISE year 2 dataset: 19,691,230,571 rows in 12,432 files
NEOWISE year 3 dataset: 19,631,135,692 rows in 12,412 files
NEOWISE year 4 dataset: 19,098,199,664 rows in 12,389 files
NEOWISE year 5 dataset: 18,795,708,783 rows in 12,361 files
NEOWISE year 6 dataset: 18,717,323,537 rows in 12,358 files
NEOWISE year 7 dataset: 18,784,693,939 rows in 12,355 files
NEOWISE year 8 dataset: 18,668,416,994 rows in 12,355 files
NEOWISE year 9 dataset: 18,650,186,132 rows in 12,355 files
NEOWISE year 10 dataset: 18,371,369,954 rows in 12,343 files
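For example, a single partition can be loaded from just one year by applying the same filter used in Section 4 to a per-year dataset. This is a minimal sketch using the first loaded year and the median partition defined in the Setup.
# Load one partition from a single year only.
year_ds = neowise_ds.children[0]  # same as year_datasets[0]
year_part_tbl = year_ds.to_table(filter=(KFIELD == median_part), columns=COLUMN_SUBSET)
print_table_size(table=year_part_tbl, pixel_index=median_part)
del year_part_tbl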
Appendix¶
A.1 Considerations when extending to specific use cases¶
Because the catalog is so large, you will need to carefully consider your specific problem and determine how to slice and filter the data most efficiently. There is no one right answer; it will depend on the use case.
A.1.1 Filtering¶
Filter out as much data as possible as early as possible. Ideas to consider are:
- With the Parquet file format, you can apply filters during the read to avoid loading
rows that you don't need.
- Pandas (not demonstrated here) supports basic filters.
- PyArrow (demonstrated here) also supports complex filters which allow you to compare values between columns and/or construct new columns on the fly (e.g., subtracting magnitude columns to construct a new color column, as done in the AllWISE notebook).
- Queries (i.e., loading data by applying filters) will be much more efficient when they
include a filter on the partitioning column ('healpix_k5'; demonstrated above).
- Notice both that this is essentially equivalent to slicing by partition and that you can filter for more than one partition at a time.
- This is highly recommended even if your use case doesn't explicitly care about it. Exceptions include situations where you're working with individual files and when it's impractical or counterproductive for the science.
- You should also include filters specific to your use case if possible.
- Exceptions: Sometimes it's not easy to write a dataset filter for the query. A cone search is a common example. In principle it could be written as a PyArrow dataset filter, but in practice the correct formula is much too complicated. In this case, it's easier to write dataset filters for broad RA and Dec limits and then do the actual cone search using astropy (see the sketch after this list). This approach is still quite performant (see the NEOWISE light curves notebook).
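For reference, here is a minimal sketch of that approach. The target coordinates and radius are arbitrary illustration values, astropy is an extra dependency that is not installed in the Imports section, and the simple RA/Dec box below ignores RA wrap-around and the cos(dec) stretch near the poles.
import astropy.units as u
from astropy.coordinates import SkyCoord

ra0, dec0, radius = 150.0, 2.0, 10 / 3600  # example target and search radius, degrees

# Partitions (order-5 pixels) that overlap the cone.
nside = hpgeom.order_to_nside(K)
cone_pixels = hpgeom.query_circle(nside, ra0, dec0, radius, nest=True, inclusive=True)

# Broad dataset filters: the overlapping partitions plus a loose RA/Dec box.
cone_filter = (
    KFIELD.isin(cone_pixels)
    & (pyarrow.compute.field("ra") > ra0 - 2 * radius)
    & (pyarrow.compute.field("ra") < ra0 + 2 * radius)
    & (pyarrow.compute.field("dec") > dec0 - 2 * radius)
    & (pyarrow.compute.field("dec") < dec0 + 2 * radius)
)
box_tbl = neowise_ds.to_table(filter=cone_filter, columns=COLUMN_SUBSET)

# Exact cone selection with astropy.
target = SkyCoord(ra0, dec0, unit="deg")
box_coords = SkyCoord(box_tbl["ra"].to_numpy(), box_tbl["dec"].to_numpy(), unit="deg")
cone_tbl = box_tbl.filter(box_coords.separation(target) < radius * u.deg)
print_table_size(table=cone_tbl)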
A.1.2 Slicing¶
Slice the dataset in some way(s), then iterate and/or parallelize over the slices. Ideas to consider are:
- Choose your slices so that you can:
- Run your processing code on one slice independently. For example, if your code must see all the data for some target object (RA and Dec) at the same time, you may slice the dataset by partition, but don't slice it by year.
- Load all data in the slice into memory at once (after applying your filters during the read). This notebook shows how to determine how big a slice of data is in order to guide this decision.
- By default, slice by partition. If this is too much data, you may also want to slice by year and/or file.
- If you have enough memory to load more than one slice simultaneously, parallelize over the slices to speed up your code.
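As a concrete illustration of that last point, here is a minimal sketch of parallelizing over partition slices with Python's built-in concurrent.futures. The function body and the list of pixels are placeholders for your own processing; threads work well here because pyarrow releases the GIL for most of the read, while heavier CPU-bound processing may call for a process- or cluster-based framework instead.
import concurrent.futures

def process_partition(pixel_index):
    """Load one partition (as in Section 4), process it, and return a small result."""
    pixel_tbl = neowise_ds.to_table(filter=(KFIELD == pixel_index), columns=COLUMN_SUBSET)
    # insert your processing code here; return something small rather than the full table
    return pixel_index, pixel_tbl.num_rows

# Each worker handles one slice at a time; choose max_workers to fit your available RAM.
example_pixels = [median_part, median_part + 1]  # placeholder list of slices
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    results = list(executor.map(process_partition, example_pixels))
print(results)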
A.2 Inspect dataset stats¶
When deciding how to slice and filter the dataset, it can be useful to understand dataset statistics like partition and file sizes.
def pixel_index_from_path(path, k_column=KCOLUMN):
"""Parse the path and return the partition pixel index.
Parameters
----------
path : str
The path to parse.
k_column : str (optional)
Name of the partitioning column.
Returns
-------
int
The partition pixel index parsed from the path.
"""
pattern = rf"({k_column}=)([0-9]+)" # matches strings like "healpix_k5=1124"
return int(re.search(pattern, path).group(2)) # pixel index, e.g., 1124
# load some file statistics to a dataframe
file_stats = pd.DataFrame(
columns=["path", KCOLUMN, "numrows"],
data=[
(frag.path, pixel_index_from_path(frag.path), frag.metadata.num_rows)
for frag in neowise_ds.get_fragments()
],
)
file_stats.sample(5)
| | path | healpix_k5 | numrows |
|---|---|---|---|
| 93759 | nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd... | 11107 | 1018876 |
| 85366 | nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd... | 9896 | 2205612 |
| 54579 | nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd... | 2854 | 1684879 |
| 15563 | nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd... | 11314 | 1431282 |
| 21611 | nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd... | 7065 | 795424 |
file_stats.describe()
| | healpix_k5 | numrows |
|---|---|---|
| count | 123801.000000 | 1.238010e+05 |
| mean | 6148.046195 | 1.525649e+06 |
| std | 3541.936107 | 8.612344e+05 |
| min | 0.000000 | 2.493900e+04 |
| 25% | 3078.000000 | 9.297430e+05 |
| 50% | 6155.000000 | 1.254763e+06 |
| 75% | 9216.000000 | 1.848016e+06 |
| max | 12287.000000 | 6.000000e+06 |
A.2.1 Dataset statistics per file¶
# visualize distribution of file sizes (number of rows)
ax = file_stats.numrows.hist(log=True)
ax.set_xlabel("Number of rows")
ax.set_ylabel("Number of files")
Text(0, 0.5, 'Number of files')
# largest file
file_stats.loc[file_stats.numrows == file_stats.numrows.max()].head(1)
| | path | healpix_k5 | numrows |
|---|---|---|---|
| 4599 | nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd... | 2551 | 6000000 |
# median file
file_stats.sort_values("numrows").iloc[len(file_stats.index) // 2]
path          nasa-irsa-wise/wise/neowiser/catalogs/p1bs_psd...
healpix_k5    3420
numrows       1254763
Name: 102327, dtype: object
A.2.2 Dataset statistics per partition¶
# get stats per partition
k_groups = file_stats[[KCOLUMN, "numrows"]].groupby(KCOLUMN)
per_part = k_groups.sum()
per_part["numfiles"] = k_groups.count()
per_part.sample(5)
| healpix_k5 | numrows | numfiles |
|---|---|---|
| 69 | 8642941 | 10 |
| 7928 | 11136330 | 10 |
| 10027 | 22061921 | 10 |
| 2064 | 9294696 | 10 |
| 2798 | 12485587 | 10 |
per_part.describe()
| | numrows | numfiles |
|---|---|---|
| count | 1.228800e+04 | 12288.000000 |
| mean | 1.537084e+07 | 10.074951 |
| std | 1.231616e+07 | 1.563031 |
| min | 6.844274e+06 | 10.000000 |
| 25% | 9.511571e+06 | 10.000000 |
| 50% | 1.243604e+07 | 10.000000 |
| 75% | 1.863406e+07 | 10.000000 |
| max | 4.990174e+08 | 87.000000 |
# visualize number of rows per partition
per_part.numrows.plot(
logy=True, xlabel=f"{KCOLUMN} pixel index", ylabel="Number of rows per partition"
)
<Axes: xlabel='healpix_k5 pixel index', ylabel='Number of rows per partition'>
# largest partition
per_part.loc[per_part.numrows == per_part.numrows.max()]
| healpix_k5 | numrows | numfiles |
|---|---|---|
| 8277 | 499017376 | 87 |
# median partition
per_part.sort_values("numrows").iloc[len(per_part.index) // 2]
numrows     12436184
numfiles          10
Name: 10936, dtype: int64
About this notebook:
- Author: Troy Raen (Applications Developer, IRSA) and the IPAC Science Platform team
- Contact: https://irsa.ipac.caltech.edu/docs/help_desk.html
- Updated: 2024-08-08