From 9a818f7c42761984ac99e08e613cc20634f8410e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Jan 2024 19:20:45 +0200 Subject: [PATCH] scripts : improve get-pg.sh (#4838) --- scripts/get-pg.sh | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/scripts/get-pg.sh b/scripts/get-pg.sh index d516db46c..b027793e1 100755 --- a/scripts/get-pg.sh +++ b/scripts/get-pg.sh @@ -2,6 +2,22 @@ function usage { echo "usage: $0" + echo "note: n is the number of essays to download" + echo "for specific n, the resulting pg.txt file will have the following number of tokens:" + echo "n | tokens" + echo "--- | ---" + echo "1 | 6230" + echo "2 | 23619" + echo "5 | 25859" + echo "10 | 36888" + echo "15 | 50188" + echo "20 | 59094" + echo "25 | 88764" + echo "30 | 103121" + echo "32 | 108338" + echo "35 | 113403" + echo "40 | 127699" + echo "45 | 135896" exit 1 } @@ -33,10 +49,17 @@ if [ -f pg.txt ]; then rm pg.txt fi +c=1 for url in $urls; do echo "processing $url" - curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt + cc=$(printf "%03d" $c) + + curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt + cat pg-$cc-one.txt >> pg.txt + + cp -v pg.txt pg-$cc-all.txt + c=$((c+1)) # don't flood the server sleep 1