llama.cpp/scripts/get-pg.sh

#!/bin/bash

# Print the command-line synopsis followed by a table of the token counts the
# resulting pg.txt is expected to have for selected values of n, then terminate
# the script.
# Arguments: none
# Outputs:   the help text on stdout
# Exits:     always, with status 1
function usage {
    # Synopsis is "<program> <n>": program name first, then the one argument.
    printf '%s\n' \
        "usage: $0 <n>" \
        "note: n is the number of essays to download" \
        "for specific n, the resulting pg.txt file will have the following number of tokens:" \
        "n   | tokens" \
        "--- | ---" \
        "1   | 6230" \
        "2   | 23619" \
        "5   | 25859" \
        "10  | 36888" \
        "15  | 50188" \
        "20  | 59094" \
        "25  | 88764" \
        "30  | 103121" \
        "32  | 108338" \
        "35  | 113403" \
        "40  | 127699" \
        "45  | 135896"
    exit 1
}

# Abort the script unless the given name resolves to an executable file.
# Arguments: $1 - command name, looked up with `command -v`
# Outputs:   an error message on stderr when the lookup fails
# Exits:     with status 1 when the command is not available; returns normally
#            otherwise
function has_cmd {
    # "$1" is quoted so the argument is looked up as exactly one name; left
    # unquoted, a value containing whitespace is split into several lookups and
    # the check passes as soon as any one of them resolves.
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for every external tool this script invokes:
# curl, html2text, tail, sed, fmt, grep, head
# (has_cmd exits the script with an error if one of them is missing)
for tool in curl html2text tail sed fmt grep head; do
    has_cmd "$tool"
done

# Exactly one argument is expected: n, the number of essays to download.
if [ $# -ne 1 ]; then
    usage
fi

n=$1

# n is handed to `head -n` to limit the URL list, so it has to be a positive
# decimal integer. Show the usage text (which exits) for anything else.
case "$n" in
    ''|*[!0-9]*) usage ;;   # empty, or contains a non-digit character
    *[1-9]*) ;;             # digits only with at least one non-zero digit
    *) usage ;;             # digits only but all zeros
esac

# get urls: download the feed, keep the lines that mention "html", cut each one
# down to its http...html link and keep the first n links
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" -e "s/html.*/html/" | head -n "$n")"

# An empty list means the feed could not be fetched or had no matching lines;
# stop here instead of reporting success without having produced pg.txt.
if [ -z "$urls" ]; then
    echo "error: no essay urls found" >&2
    exit 1
fi

printf "urls:\n%s\n" "$urls"

# start from a clean combined file
rm -f pg.txt

c=1
# The list is read line by line on fd 3, so the URLs are not subject to word
# splitting or glob expansion and the loop body keeps its own stdin.
while IFS= read -r url <&3; do
    echo "processing $url"

    # zero-padded essay index used in the output file names
    cc=$(printf "%03d" "$c")

    # Fetch the essay, convert it to text, drop the first three lines, strip
    # leading whitespace and re-wrap to 80 columns. The per-essay file is
    # truncated (>) rather than appended to, so running the script again does
    # not pile the same essay on top of the output of a previous run.
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 > "pg-$cc-one.txt"
    cat "pg-$cc-one.txt" >> pg.txt

    # snapshot of everything collected so far
    cp -v pg.txt "pg-$cc-all.txt"
    c=$((c+1))

    # don't flood the server
    sleep 1
done 3<<< "$urls"

echo "done. data in pg.txt"

exit 0
scripts : script to get Paul Graham essays in txt format (#4838) 2024-01-09 14:23:05 +00:00			`#!/bin/bash`

			`function usage {`
			`echo "usage: <n>$0"`
scripts : improve get-pg.sh (#4838) 2024-01-09 17:20:45 +00:00			`echo "note: n is the number of essays to download"`
			`echo "for specific n, the resulting pg.txt file will have the following number of tokens:"`
			`echo "n \| tokens"`
			`echo "--- \| ---"`
			`echo "1 \| 6230"`
			`echo "2 \| 23619"`
			`echo "5 \| 25859"`
			`echo "10 \| 36888"`
			`echo "15 \| 50188"`
			`echo "20 \| 59094"`
			`echo "25 \| 88764"`
			`echo "30 \| 103121"`
			`echo "32 \| 108338"`
			`echo "35 \| 113403"`
			`echo "40 \| 127699"`
			`echo "45 \| 135896"`
scripts : script to get Paul Graham essays in txt format (#4838) 2024-01-09 14:23:05 +00:00			`exit 1`
			`}`

			`function has_cmd {`
			`if ! [ -x "$(command -v $1)" ]; then`
			`echo "error: $1 is not available" >&2`
			`exit 1`
			`fi`
			`}`

			`# check for: curl, html2text, tail, sed, fmt`
			`has_cmd curl`
			`has_cmd html2text`
			`has_cmd tail`
			`has_cmd sed`

			`if [ $# -ne 1 ]; then`
			`usage`
			`fi`

			`n=$1`

			`# get urls`
			`urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss \| grep html \| sed -e "s/.*http/http/" \| sed -e "s/html.*/html/" \| head -n $n)"`

			`printf "urls:\n%s\n" "$urls"`

			`if [ -f pg.txt ]; then`
			`rm pg.txt`
			`fi`

scripts : improve get-pg.sh (#4838) 2024-01-09 17:20:45 +00:00			`c=1`
scripts : script to get Paul Graham essays in txt format (#4838) 2024-01-09 14:23:05 +00:00			`for url in $urls; do`
			`echo "processing $url"`

scripts : improve get-pg.sh (#4838) 2024-01-09 17:20:45 +00:00			`cc=$(printf "%03d" $c)`

			`curl -L $url \| html2text \| tail -n +4 \| sed -E "s/^[[:space:]]+//g" \| fmt -w 80 >> pg-$cc-one.txt`
			`cat pg-$cc-one.txt >> pg.txt`

			`cp -v pg.txt pg-$cc-all.txt`
			`c=$((c+1))`
scripts : script to get Paul Graham essays in txt format (#4838) 2024-01-09 14:23:05 +00:00
			`# don't flood the server`
			`sleep 1`
			`done`

			`echo "done. data in pg.txt"`

			`exit 0`