Use materialized views for export to csv
parent
2172651870
commit
95ca4f0eef
100
csv_export.sh
100
csv_export.sh
|
@ -2,78 +2,48 @@
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
# ------------------------------ FOLLOWS ----------------------------------
|
# ------------------------------ Write data timestamp ----------------------------------
|
||||||
|
|
||||||
follows_query="$(cat <<- EOF
|
echo "export_start" > timestamp.csv
|
||||||
select repos.did as ":START_ID", records.content ->> 'subject' as ":END_ID" from repos join records on repos.id = records.repo where records.collection = 'app.bsky.graph.follow' and records.content ->> 'subject' <> repos.did
|
date -Iseconds --utc >> timestamp.csv
|
||||||
|
|
||||||
|
# ------------------------------ Refresh views ----------------------------------
|
||||||
|
|
||||||
|
docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
|
||||||
|
\timing
|
||||||
|
\echo Refreshing follows...
|
||||||
|
refresh materialized view export_follows;
|
||||||
|
\echo Refreshing like counts...
|
||||||
|
refresh materialized view export_likes;
|
||||||
|
\echo Refreshing reply counts...
|
||||||
|
refresh materialized view export_replies;
|
||||||
|
\echo Refreshing DID list...
|
||||||
|
refresh materialized view export_dids;
|
||||||
EOF
|
EOF
|
||||||
)"
|
|
||||||
|
|
||||||
echo "Dumping follows..."
|
# ------------------------------ Dump views into .csv ----------------------------------
|
||||||
|
|
||||||
|
echo "Writing .csv files..."
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "copy (${follows_query}) to stdout with csv header;" > follows.csv
|
-c "copy (select * from export_follows) to stdout with csv header;" > follows.csv
|
||||||
echo "Done: $(ls -lh follows.csv)"
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
-c "copy (select * from export_likes) to stdout with csv header;" > like_counts.csv
|
||||||
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
-c "copy (select * from export_replies) to stdout with csv header;" > post_counts.csv
|
||||||
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
-c "copy (select * from export_dids) to stdout with csv header;" > dids.csv
|
||||||
|
|
||||||
# ------------------------------ FOLLOWS ----------------------------------
|
# ------------------------------ Free up space used by materialized views ----------------------------------
|
||||||
|
|
||||||
# ------------------------------ LIKES ----------------------------------
|
docker compose exec -iT postgres psql -U postgres -d bluesky <<- EOF
|
||||||
|
\timing
|
||||||
likes_query="$(cat <<- EOF
|
refresh materialized view export_follows with no data;
|
||||||
select repos.did as ":START_ID", subject_did as ":END_ID", "count:long" from
|
refresh materialized view export_likes with no data;
|
||||||
repos join lateral (
|
refresh materialized view export_replies with no data;
|
||||||
select repo, split_part(content['subject'] ->> 'uri', '/', 3) as subject_did, count(*) as "count:long" from records where repos.id = records.repo AND records.collection = 'app.bsky.feed.like' group by repo, split_part(content['subject'] ->> 'uri', '/', 3)
|
refresh materialized view export_dids with no data;
|
||||||
) as r on repos.id = r.repo
|
|
||||||
where repos.did <> subject_did
|
|
||||||
EOF
|
EOF
|
||||||
)"
|
|
||||||
|
|
||||||
echo "Dumping likes..."
|
# ------------------------------ Dump handles from plc-mirror ----------------------------------
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
|
||||||
-c "copy (${likes_query}) to stdout with csv header;" > like_counts.csv
|
|
||||||
echo "Done: $(ls -lh like_counts.csv)"
|
|
||||||
|
|
||||||
# ------------------------------ LIKES ----------------------------------
|
|
||||||
|
|
||||||
# ------------------------------ REPLIES ----------------------------------
|
|
||||||
|
|
||||||
posts_query="$(cat <<- EOF
|
|
||||||
select repos.did as ":START_ID", subject_did as ":END_ID", "count:long" from
|
|
||||||
repos join lateral (
|
|
||||||
select repo, split_part(content['reply']['parent'] ->> 'uri', '/', 3) as subject_did, count(*) as "count:long" from records where repos.id = records.repo AND records.collection = 'app.bsky.feed.post' group by repo, split_part(content['reply']['parent'] ->> 'uri', '/', 3)
|
|
||||||
) as r on repos.id = r.repo
|
|
||||||
where repos.did <> subject_did
|
|
||||||
EOF
|
|
||||||
)"
|
|
||||||
|
|
||||||
echo "Dumping posts..."
|
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
|
||||||
-c "copy (${posts_query}) to stdout with csv header;" > post_counts.csv
|
|
||||||
echo "Done: $(ls -lh post_counts.csv)"
|
|
||||||
|
|
||||||
# ------------------------------ REPLIES ----------------------------------
|
|
||||||
|
|
||||||
# ------------------------------ DIDS + HANDLES ----------------------------------
|
|
||||||
|
|
||||||
dids_query="$(cat <<- EOF
|
|
||||||
insert into repos (did)
|
|
||||||
select distinct did from (
|
|
||||||
select distinct (split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)) as did from records_post where collection='app.bsky.feed.post'
|
|
||||||
union
|
|
||||||
select distinct (split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)) from records where collection='app.bsky.feed.like'
|
|
||||||
union
|
|
||||||
select distinct (jsonb_extract_path_text(content, 'subject')) from records where collection='app.bsky.graph.follow'
|
|
||||||
)
|
|
||||||
on conflict (did) do nothing;
|
|
||||||
copy (select did as "did:ID" from repos) to stdout with csv header;
|
|
||||||
EOF
|
|
||||||
)"
|
|
||||||
|
|
||||||
echo "Dumping DIDs..."
|
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
|
||||||
-c "${dids_query}" > dids.csv
|
|
||||||
echo "Done: $(ls -lh dids.csv)"
|
|
||||||
|
|
||||||
docker exec -it plc-postgres-1 psql -U postgres -d plc \
|
docker exec -it plc-postgres-1 psql -U postgres -d plc \
|
||||||
-c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' > handles.csv
|
-c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > handles.csv
|
||||||
|
|
||||||
# ------------------------------ HANDLES ----------------------------------
|
|
||||||
|
|
|
@ -0,0 +1,46 @@
|
||||||
|
-- Create a bunch of materialized views, but don't populate them right away.
|
||||||
|
|
||||||
|
create materialized view export_follows
|
||||||
|
as select repos.did as ":START_ID",
|
||||||
|
records.content ->> 'subject' as ":END_ID"
|
||||||
|
from repos join records on repos.id = records.repo
|
||||||
|
where records.collection = 'app.bsky.graph.follow'
|
||||||
|
and records.content ->> 'subject' <> repos.did
|
||||||
|
with no data;
|
||||||
|
create index export_follow_subject on export_follows (":END_ID");
|
||||||
|
|
||||||
|
-- Thanks to `join`, eats up 30GB+ of space while refreshing, but
|
||||||
|
-- finishes in under an hour.
|
||||||
|
create materialized view export_likes
|
||||||
|
as select repos.did as ":START_ID",
|
||||||
|
split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3) as ":END_ID",
|
||||||
|
count(*) as "count:long"
|
||||||
|
from records join repos on records.repo = repos.id
|
||||||
|
where records.collection = 'app.bsky.feed.like'
|
||||||
|
and repos.did <> split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
|
||||||
|
group by repos.did, split_part(jsonb_extract_path_text(content, 'subject', 'uri'), '/', 3)
|
||||||
|
with no data;
|
||||||
|
create index export_like_subject on export_likes (":END_ID");
|
||||||
|
|
||||||
|
create materialized view export_replies
|
||||||
|
as select repos.did as ":START_ID",
|
||||||
|
split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3) as ":END_ID",
|
||||||
|
count(*) as "count:long"
|
||||||
|
from records join repos on records.repo = repos.id
|
||||||
|
where records.collection = 'app.bsky.feed.post'
|
||||||
|
and repos.did <> split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
|
||||||
|
group by repos.did, split_part(jsonb_extract_path_text(content, 'reply', 'parent', 'uri'), '/', 3)
|
||||||
|
with no data;
|
||||||
|
create index export_reply_subject on export_replies (":END_ID");
|
||||||
|
|
||||||
|
create materialized view export_dids
|
||||||
|
as select distinct did as "did:ID" from (
|
||||||
|
select did from repos
|
||||||
|
union
|
||||||
|
select distinct ":END_ID" as did from export_follows
|
||||||
|
union
|
||||||
|
select distinct ":END_ID" as did from export_likes
|
||||||
|
union
|
||||||
|
select distinct ":END_ID" as did from export_replies
|
||||||
|
)
|
||||||
|
with no data;
|
Loading…
Reference in New Issue