Add monthly export.
parent
a568e16624
commit
59922eb54c
8
Makefile
8
Makefile
|
@ -74,12 +74,20 @@ csv-iexport:
|
||||||
@sleep 10
|
@sleep 10
|
||||||
@nohup ./csv_iexport.sh > csv_iexport.out &
|
@nohup ./csv_iexport.sh > csv_iexport.out &
|
||||||
|
|
||||||
|
# Start the postgres service, pause for a fixed 10 seconds (presumably to let
# the database come up), then launch the monthly CSV export in the background.
# The script's stdout is written to csv_iexport_month.out.
# Declared phony so a stray file named like the target cannot suppress it.
.PHONY: csv-iexport-month
csv-iexport-month:
	@docker compose up -d postgres
	@sleep 10
	@nohup ./csv_iexport_month.sh > csv_iexport_month.out &
|
||||||
|
|
||||||
kill-csv-export:
|
kill-csv-export:
|
||||||
@kill -9 `pgrep csv_export.sh`
|
@kill -9 `pgrep csv_export.sh`
|
||||||
|
|
||||||
kill-csv-iexport:
|
kill-csv-iexport:
|
||||||
@kill -9 `pgrep csv_iexport.sh`
|
@kill -9 `pgrep csv_iexport.sh`
|
||||||
|
|
||||||
|
# Force-kill a running monthly export script.
# Without -f, pgrep/pkill compare the pattern against the process name, which
# is limited to 15 characters; "csv_iexport_month.sh" is longer, so it would
# never match. Match against the full command line instead. The [c] bracket
# keeps the pattern from matching the shell that runs this very recipe.
.PHONY: kill-csv-iexport-month
kill-csv-iexport-month:
	@pkill -9 -f '[c]sv_iexport_month\.sh'
|
||||||
|
|
||||||
# ---------------------------- CSV Export ----------------------------
|
# ---------------------------- CSV Export ----------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,9 +4,17 @@ set -e
|
||||||
|
|
||||||
# ------------------------------ Write data timestamp ----------------------------------
|
# ------------------------------ Write data timestamp ----------------------------------
|
||||||
|
|
||||||
|
date=$(date -Idate --utc)
|
||||||
|
|
||||||
|
mkdir -p ${CSV_DIR}/full
|
||||||
|
mkdir -p ${CSV_DIR}/full/${date}
|
||||||
|
|
||||||
|
echo "Output directory: ${CSV_DIR}/full/${date}"
|
||||||
|
|
||||||
to_timestamp=$(date -Iseconds --utc)
|
to_timestamp=$(date -Iseconds --utc)
|
||||||
echo "export_start" > ${CSV_DIR}/timestamp.csv
|
|
||||||
echo "${to_timestamp}" >> ${CSV_DIR}/timestamp.csv
|
echo "export_start" > ${CSV_DIR}/full/${date}/timestamp.csv
|
||||||
|
echo "${to_timestamp}" >> ${CSV_DIR}/full/${date}/timestamp.csv
|
||||||
|
|
||||||
# ------------------------------ Refresh views ----------------------------------
|
# ------------------------------ Refresh views ----------------------------------
|
||||||
|
|
||||||
|
@ -35,7 +43,7 @@ folows_started=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$folows_started', '$to_timestamp', 'app.bsky.graph.follow')"
|
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$folows_started', '$to_timestamp', 'app.bsky.graph.follow')"
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "copy (select * from export_follows) to stdout with csv header;" > ${CSV_DIR}/follows.csv
|
-c "copy (select * from export_follows) to stdout with csv header;" > ${CSV_DIR}/full/${date}/follows.csv
|
||||||
echo "Finishing follows export..."
|
echo "Finishing follows export..."
|
||||||
folows_finished=$(date -Iseconds --utc)
|
folows_finished=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
@ -46,7 +54,7 @@ block_started=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$block_started', '$to_timestamp', 'app.bsky.graph.block')"
|
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$block_started', '$to_timestamp', 'app.bsky.graph.block')"
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "copy (select * from export_blocks) to stdout with csv header;" > ${CSV_DIR}/blocks.csv
|
-c "copy (select * from export_blocks) to stdout with csv header;" > ${CSV_DIR}/full/${date}/blocks.csv
|
||||||
echo "Finishing blocks export..."
|
echo "Finishing blocks export..."
|
||||||
block_finished=$(date -Iseconds --utc)
|
block_finished=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
@ -58,7 +66,7 @@ likes_started=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$likes_started', '$to_timestamp', 'app.bsky.feed.like')"
|
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$likes_started', '$to_timestamp', 'app.bsky.feed.like')"
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "copy (select * from export_likes) to stdout with csv header;" > ${CSV_DIR}/like_counts.csv
|
-c "copy (select * from export_likes) to stdout with csv header;" > ${CSV_DIR}/full/${date}/like_counts.csv
|
||||||
echo "Finishing likes export..."
|
echo "Finishing likes export..."
|
||||||
likes_finished=$(date -Iseconds --utc)
|
likes_finished=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
@ -69,7 +77,7 @@ posts_started=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$posts_started', '$to_timestamp', 'app.bsky.feed.post')"
|
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$posts_started', '$to_timestamp', 'app.bsky.feed.post')"
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "copy (select * from export_replies) to stdout with csv header;" > ${CSV_DIR}/post_counts.csv
|
-c "copy (select * from export_replies) to stdout with csv header;" > ${CSV_DIR}/full/${date}/post_counts.csv
|
||||||
echo "Finishing posts export..."
|
echo "Finishing posts export..."
|
||||||
posts_finished=$(date -Iseconds --utc)
|
posts_finished=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
@ -80,7 +88,7 @@ dids_started=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$dids_started', '$to_timestamp', 'did')"
|
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$dids_started', '$to_timestamp', 'did')"
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "copy (select * from export_dids) to stdout with csv header;" > ${CSV_DIR}/dids.csv
|
-c "copy (select * from export_dids) to stdout with csv header;" > ${CSV_DIR}/full/${date}/dids.csv
|
||||||
echo "Finishing dids export..."
|
echo "Finishing dids export..."
|
||||||
dids_finished=$(date -Iseconds --utc)
|
dids_finished=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
|
@ -89,7 +97,7 @@ docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
echo "Starting optouts export..."
|
echo "Starting optouts export..."
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "copy (select did from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:
|
-c "copy (select did from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:
|
||||||
plc:qevje4db3tazfbbialrlrkza%') to stdout with csv header;" > ${CSV_DIR}/optout.csv
|
plc:qevje4db3tazfbbialrlrkza%') to stdout with csv header;" > ${CSV_DIR}/full/${date}/optout.csv
|
||||||
echo "Finishing optouts export..."
|
echo "Finishing optouts export..."
|
||||||
|
|
||||||
|
|
||||||
|
@ -102,8 +110,8 @@ handles_started=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$handles_started', '$to_timestamp', 'handle')"
|
-c "insert into incremental_export_log (started, to_tsmp, collection) values ('$handles_started', '$to_timestamp', 'handle')"
|
||||||
docker exec -t plc-postgres-1 psql -U postgres -d plc \
|
docker exec -t plc-postgres-1 psql -U postgres -d plc \
|
||||||
-c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > ${CSV_DIR}/handles.csv
|
-c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > ${CSV_DIR}/full/${date}/handles.csv
|
||||||
echo "Finishing dids export..."
|
echo "Finishing handles export..."
|
||||||
handles_finished=$(date -Iseconds --utc)
|
handles_finished=$(date -Iseconds --utc)
|
||||||
docker compose exec -it postgres psql -U postgres -d bluesky \
|
docker compose exec -it postgres psql -U postgres -d bluesky \
|
||||||
-c "update incremental_export_log set finished='$handles_finished' where started='$handles_started' and to_tsmp='$to_timestamp' and collection = 'handle'"
|
-c "update incremental_export_log set finished='$handles_finished' where started='$handles_started' and to_tsmp='$to_timestamp' and collection = 'handle'"
|
||||||
|
|
|
@ -0,0 +1,118 @@
|
||||||
|
#!/bin/bash
# Monthly CSV export: setup. Loads settings from .env, creates the dated
# output directory and records the export start time.
source .env

set -e

# Refuse to run with an empty CSV_DIR; otherwise output would land in /monthly.
: "${CSV_DIR:?CSV_DIR must be set (expected to come from .env)}"

# ------------------------------ Write data timestamp ----------------------------------

# UTC calendar date; names the per-run output directory.
date=$(date -Idate --utc)

# -p creates the intermediate "monthly" directory as well.
mkdir -p "${CSV_DIR}/monthly/${date}"

echo "Output directory: ${CSV_DIR}/monthly/${date}"

# UTC start time of this run; written to timestamp.csv and used as to_tsmp
# in every incremental_export_log row written by the sections below.
to_timestamp=$(date -Iseconds --utc)
echo "export_start" > "${CSV_DIR}/monthly/${date}/timestamp.csv"
echo "${to_timestamp}" >> "${CSV_DIR}/monthly/${date}/timestamp.csv"
|
||||||
|
|
||||||
|
# ------------------------------ Refresh views ----------------------------------

# Refresh the monthly materialized views (and the optout list) before dumping.
# ON_ERROR_STOP makes psql stop at the first failed statement and exit
# non-zero, so `set -e` aborts the run instead of exporting from views that
# were not refreshed. -T: input is a heredoc, so no TTY is allocated.
docker compose exec -iT postgres psql -U postgres -d bluesky -v ON_ERROR_STOP=1 <<- EOF
\timing
\echo Refreshing follows...
refresh materialized view export_follows_month;
\echo Refreshing like counts...
refresh materialized view export_likes_month;
\echo Refreshing reply counts...
refresh materialized view export_replies_month;
\echo Refreshing block list...
refresh materialized view export_blocks_month;
\echo Refreshing DID list...
refresh materialized view export_dids_month;
\echo Refreshing optout list...
refresh materialized view export_optouts;
EOF
|
||||||
|
|
||||||
|
# ------------------------------ Dump views into .csv ----------------------------------

echo "Writing .csv files..."

# Follows: log the start in incremental_export_log, dump the view to
# follows.csv, then stamp the same log row with the finish time.
echo "Starting follows export..."
follows_collection='app.bsky.graph.follow_month'
follows_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$follows_started', '$to_timestamp', '$follows_collection')"
# Redirect target is quoted so a CSV_DIR containing spaces does not break it.
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_follows_month) to stdout with csv header;" > "${CSV_DIR}/monthly/${date}/follows.csv"
echo "Finishing follows export..."
follows_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$follows_finished' where started='$follows_started' and to_tsmp='$to_timestamp' and collection = '$follows_collection'"
|
||||||
|
|
||||||
|
# Blocks: log the start in incremental_export_log, dump the view to
# blocks.csv, then stamp the same log row with the finish time.
echo "Starting blocks export..."
blocks_collection='app.bsky.graph.block_month'
block_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$block_started', '$to_timestamp', '$blocks_collection')"
# Redirect target is quoted so a CSV_DIR containing spaces does not break it.
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_blocks_month) to stdout with csv header;" > "${CSV_DIR}/monthly/${date}/blocks.csv"
echo "Finishing blocks export..."
block_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$block_finished' where started='$block_started' and to_tsmp='$to_timestamp' and collection = '$blocks_collection'"
|
||||||
|
|
||||||
|
|
||||||
|
# Likes: log the start in incremental_export_log, dump the view to
# like_counts.csv, then stamp the same log row with the finish time.
echo "Starting likes export..."
likes_collection='app.bsky.feed.like_month'
likes_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$likes_started', '$to_timestamp', '$likes_collection')"
# Redirect target is quoted so a CSV_DIR containing spaces does not break it.
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_likes_month) to stdout with csv header;" > "${CSV_DIR}/monthly/${date}/like_counts.csv"
echo "Finishing likes export..."
likes_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$likes_finished' where started='$likes_started' and to_tsmp='$to_timestamp' and collection = '$likes_collection'"
|
||||||
|
|
||||||
|
# Posts: log the start in incremental_export_log, dump the reply-count view
# to post_counts.csv, then stamp the same log row with the finish time.
echo "Starting posts export..."
posts_collection='app.bsky.feed.post_month'
posts_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$posts_started', '$to_timestamp', '$posts_collection')"
# Redirect target is quoted so a CSV_DIR containing spaces does not break it.
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_replies_month) to stdout with csv header;" > "${CSV_DIR}/monthly/${date}/post_counts.csv"
echo "Finishing posts export..."
posts_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$posts_finished' where started='$posts_started' and to_tsmp='$to_timestamp' and collection = '$posts_collection'"
|
||||||
|
|
||||||
|
# DIDs: log the start in incremental_export_log, dump the view to dids.csv,
# then stamp the same log row with the finish time.
echo "Starting dids export..."
dids_collection='did_month'
dids_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$dids_started', '$to_timestamp', '$dids_collection')"
# Redirect target is quoted so a CSV_DIR containing spaces does not break it.
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select * from export_dids_month) to stdout with csv header;" > "${CSV_DIR}/monthly/${date}/dids.csv"
echo "Finishing dids export..."
dids_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$dids_finished' where started='$dids_started' and to_tsmp='$to_timestamp' and collection = '$dids_collection'"
|
||||||
|
|
||||||
|
# Optouts: dump the DIDs of repos that own a block record whose 'subject'
# contains the DID below. No incremental_export_log row is written here.
# The LIKE pattern must stay on one line: a line break inside the quoted SQL
# would become part of the pattern and prevent it from matching.
echo "Starting optouts export..."
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "copy (select did from repos as r inner join records_block as rb on r.id=rb.repo where rb.content['subject']::text like '%did:plc:qevje4db3tazfbbialrlrkza%') to stdout with csv header;" > "${CSV_DIR}/monthly/${date}/optout.csv"
echo "Finishing optouts export..."
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------ DO NOT Free up space used by materialized views for incremental refresh ----------------------------------
|
||||||
|
|
||||||
|
# ------------------------------ Dump handles from plc-mirror ----------------------------------

# Handles come from the separate plc database (container plc-postgres-1), but
# the run is still logged in incremental_export_log in the bluesky database.
echo "Starting handles export..."
handles_collection='handle_month'
handles_started=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "insert into incremental_export_log (started, to_tsmp, collection) values ('$handles_started', '$to_timestamp', '$handles_collection')"
# The sed step doubles a single backslash that sits directly before a closing
# quote + comma. Redirect target is quoted so a CSV_DIR with spaces works.
docker exec -t plc-postgres-1 psql -U postgres -d plc \
  -c 'copy (select handle, did as "did:ID" from actors) to stdout with (format csv , header, force_quote ("handle"));' | sed -E -e 's/([^\\])\\",/\1\\\\",/g' > "${CSV_DIR}/monthly/${date}/handles.csv"
echo "Finishing handles export..."
handles_finished=$(date -Iseconds --utc)
docker compose exec -it postgres psql -U postgres -d bluesky \
  -c "update incremental_export_log set finished='$handles_finished' where started='$handles_started' and to_tsmp='$to_timestamp' and collection = '$handles_collection'"
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
-- Create a bunch of materialized views, but don't populate them right away.
|
||||||
|
|
||||||
|
-- One row per 'app.bsky.graph.follow' record created after
-- CURRENT_DATE - 30 days: the owning repo's DID as ":START_ID" and the
-- record's 'subject' as ":END_ID". Rows whose subject equals the owner's DID
-- are excluded. Created unpopulated; REFRESH MATERIALIZED VIEW fills it.
CREATE MATERIALIZED VIEW export_follows_month AS
SELECT
    rp.did AS ":START_ID",
    rec.content ->> 'subject' AS ":END_ID"
FROM repos AS rp
INNER JOIN records AS rec
    ON rp.id = rec.repo
WHERE rec.collection = 'app.bsky.graph.follow'
    AND rec.content ->> 'subject' <> rp.did
    AND rec.created_at > CURRENT_DATE - INTERVAL '30' DAY
WITH NO DATA;

CREATE INDEX export_follow_subject_month ON export_follows_month (":END_ID");
|
||||||
|
|
||||||
|
-- Thanks to `join`, eats up 30GB+ of space while refreshing, but
-- finishes in under an hour.
--
-- Like counts per (":START_ID", ":END_ID") pair over 'app.bsky.feed.like'
-- records created after CURRENT_DATE - 30 days. ":START_ID" is the owning
-- repo's DID; ":END_ID" is the third '/'-separated field of subject.uri
-- (presumably the DID in an at:// URI -- confirm). Pairs where both sides are
-- equal are excluded. Created unpopulated.
CREATE MATERIALIZED VIEW export_likes_month AS
SELECT
    rp.did AS ":START_ID",
    split_part(jsonb_extract_path_text(rec.content, 'subject', 'uri'), '/', 3) AS ":END_ID",
    count(*) AS "count:long"
FROM records AS rec
INNER JOIN repos AS rp
    ON rec.repo = rp.id
WHERE rec.collection = 'app.bsky.feed.like'
    AND rec.created_at > CURRENT_DATE - INTERVAL '30' DAY
    AND rp.did <> split_part(jsonb_extract_path_text(rec.content, 'subject', 'uri'), '/', 3)
GROUP BY
    rp.did,
    split_part(jsonb_extract_path_text(rec.content, 'subject', 'uri'), '/', 3)
WITH NO DATA;

CREATE INDEX export_like_subject_month ON export_likes_month (":END_ID");
|
||||||
|
|
||||||
|
-- Reply counts per (":START_ID", ":END_ID") pair over 'app.bsky.feed.post'
-- records created after CURRENT_DATE - 30 days. ":START_ID" is the owning
-- repo's DID; ":END_ID" is the third '/'-separated field of reply.parent.uri.
-- Posts without that field yield NULL and are dropped by the <> comparison,
-- as are pairs where both sides are equal. Created unpopulated.
CREATE MATERIALIZED VIEW export_replies_month AS
SELECT
    rp.did AS ":START_ID",
    split_part(jsonb_extract_path_text(rec.content, 'reply', 'parent', 'uri'), '/', 3) AS ":END_ID",
    count(*) AS "count:long"
FROM records AS rec
INNER JOIN repos AS rp
    ON rec.repo = rp.id
WHERE rec.collection = 'app.bsky.feed.post'
    AND rec.created_at > CURRENT_DATE - INTERVAL '30' DAY
    AND rp.did <> split_part(jsonb_extract_path_text(rec.content, 'reply', 'parent', 'uri'), '/', 3)
GROUP BY
    rp.did,
    split_part(jsonb_extract_path_text(rec.content, 'reply', 'parent', 'uri'), '/', 3)
WITH NO DATA;

CREATE INDEX export_reply_subject_month ON export_replies_month (":END_ID");
|
||||||
|
|
||||||
|
-- One row per 'app.bsky.graph.block' record created after
-- CURRENT_DATE - 30 days: the owning repo's DID as ":START_ID" and the
-- record's 'subject' as ":END_ID". Rows whose subject equals the owner's DID
-- are excluded. Created unpopulated; REFRESH MATERIALIZED VIEW fills it.
CREATE MATERIALIZED VIEW export_blocks_month AS
SELECT
    rp.did AS ":START_ID",
    rec.content ->> 'subject' AS ":END_ID"
FROM repos AS rp
INNER JOIN records AS rec
    ON rp.id = rec.repo
WHERE rec.collection = 'app.bsky.graph.block'
    AND rec.created_at > CURRENT_DATE - INTERVAL '30' DAY
    AND rec.content ->> 'subject' <> rp.did
WITH NO DATA;

CREATE INDEX export_block_subject_month ON export_blocks_month (":END_ID");
|
||||||
|
|
||||||
|
|
||||||
|
-- Every DID in repos plus every ":END_ID" that appears in the monthly
-- follows, likes, replies and blocks views, as a single "did:ID" column.
-- UNION already removes duplicates, so no extra DISTINCT is needed.
-- The derived table carries an alias because PostgreSQL releases before 16
-- reject a FROM subquery without one.
-- Depends on the four *_month views above; created unpopulated.
CREATE MATERIALIZED VIEW export_dids_month AS
SELECT all_dids.did AS "did:ID"
FROM (
    SELECT did FROM repos
    UNION
    SELECT ":END_ID" AS did FROM export_follows_month
    UNION
    SELECT ":END_ID" AS did FROM export_likes_month
    UNION
    SELECT ":END_ID" AS did FROM export_replies_month
    UNION
    SELECT ":END_ID" AS did FROM export_blocks_month
) AS all_dids
WITH NO DATA;
|
Loading…
Reference in New Issue