#!/bin/bash

function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n | tokens"
    echo "--- | ---"
    echo "1 | 6230"
    echo "2 | 23619"
    echo "5 | 25859"
    echo "10 | 36888"
    echo "15 | 50188"
    echo "20 | 59094"
    echo "25 | 88764"
    echo "30 | 103121"
    echo "32 | 108338"
    echo "35 | 113403"
    echo "40 | 127699"
    echo "45 | 135896"
    exit 1
}

function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# extract the first n essay URLs from the RSS feed
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"

printf "urls:\n%s\n" "$urls"

# start from a clean combined output file
if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" $c)

    # convert the essay to plain text: skip the first 3 lines of html2text output,
    # strip leading whitespace, and re-wrap to 80 columns
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt

    # pg-NNN-one.txt holds a single essay; pg.txt accumulates all essays so far,
    # and pg-NNN-all.txt snapshots the cumulative file after each download
    cat pg-$cc-one.txt >> pg.txt
    cp -v pg.txt pg-$cc-all.txt

    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0
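
# Example usage (a minimal sketch; the filename get-pg.sh and the wc/ls checks
# below are illustrative assumptions, not part of the script itself):
#
#   chmod +x get-pg.sh
#   ./get-pg.sh 10        # download the first 10 essays listed in the feed
#   wc -w pg.txt          # rough sanity check on the amount of text fetched
#   ls pg-0*-one.txt      # per-essay files, numbered in download order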