A Simple Web Crawler in Shell

Sep 15th, 2013

For pages whose structure is not too complex, grep and sed are enough to parse the HTML and pull out the target URLs, which can then be downloaded with wget. Below are two examples.

Fetching the "妹子图" (ooxx) column of 煎蛋 (jandan.net)

```sh
#!/bin/sh
# Download all jpg/gif images from pages $1..$2 of http://jandan.net/ooxx/
delay=0      # seconds to sleep between pages
timeout=10   # wget timeout per download
retry=2      # wget retries per download

usage() {
    echo "Usage: `basename $0` frompage topage"
    exit 1
}

[ $# -ne 2 ] && usage
# Both page numbers must be positive, and frompage must not exceed topage.
[ $1 -le 0 ] || [ $2 -le 0 ] || [ $1 -gt $2 ] && usage

page=$1
while [ $page -le $2 ] ; do
    echo "[INFO] Downloading page $page ..."
    # Extract the image URLs: grep keeps the <img ...> tags, sed strips the
    # markup before and after the src attribute value.
    imgs=`wget -q -O - http://jandan.net/ooxx/page-$page | grep -ioE '<img src="http.*\.(jpg|gif)" */>' | sed 's/<[Ii][Mm][Gg].*[Ss][Rr][Cc]\s*=\s*"*//;s/[" ].*//'`
    [ -z "$imgs" ] && {
        echo "[WARNING] Failed to get image list of page $page"
        page=$((page+1))
        continue
    }
    index=1
    for img in $imgs ; do
        # Save as <page>-<index>.<original extension>
        wget -q --tries=$retry --timeout=$timeout -O "$page-$index.${img##*.}" "$img"
        if [ $? -ne 0 ] ; then
            rm -f "$page-$index.${img##*.}"
            echo "[WARNING] Failed to download $img"
        else
            index=$((index+1))
        fi
    done
    page=$((page+1))
    [ $page -le $2 ] && sleep $delay
done
exit 0
```

Fetching all photos uploaded by a given user on Panoramio

```sh
#!/bin/sh
# Download every photo of one Panoramio user, walking the paginated photo list.
delay=0      # seconds to sleep between downloads
timeout=10
retry=2

usage() {
    echo "Usage: `basename $0` userid"
    exit 1
}

[ $# -ne 1 ] && usage

global_index=1
page=1
while true ; do
    echo "[INFO] Processing page $page"
    wget -q -O .tmp "http://www.panoramio.com/user/$1?photo_page=$page"
    if [ $? -ne 0 ] ; then
        echo "[WARNING] Can not get content of page $page"
        break
    fi
    # Total number of photos, parsed once from the "stats" link.
    if [ -z "$total" ] ; then
        total=`grep -iE '<a href="\/user\/.*\/stats' .tmp | head -n 1 | sed 's/.*stats">//;s/<.*//'`
    fi
    # Photo ids and their titles, one per line.
    imgs=`grep -ioE '<a href="\/photo\/[0-9]*"$' .tmp | sed 's/.*\///;s/"//'`
    alts=`grep -ioE '^ >..*<\/a>$' .tmp | sed 's/<.*//;s/.*>//'`
    index=1
    for img in $imgs ; do
        url=http://static.panoramio.com/photos/original/$img.jpg
        # Pick the title that corresponds to the current photo id.
        alt=`{ for alt in $alts ; do echo $alt ; done ; } | head -n $index | tail -n 1`
        # File name: zero-padded running number plus the photo title.
        file=`echo $global_index | awk '{printf "%03d", $0}'`_$alt.jpg
        index=$((index+1))
        global_index=$((global_index+1))
        if [ -f "$file" ] ; then
            continue
        fi
        echo "[INFO] Downloading $url"
        wget -q --tries=$retry --timeout=$timeout -O "$file" "$url"
        if [ $? -ne 0 ] ; then
            rm -f "$file"
            echo "[WARNING] Failed to download $url to $file"
        fi
        sleep $delay
    done
    # Stop once all photos reported by the stats link have been handled.
    if [ $global_index -gt $total ] ; then
        break
    fi
    page=$((page+1))
done
rm -f .tmp
exit 0
```
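
To see what the extraction step in the first script actually does, the grep/sed pipeline can be tried on its own against a locally saved listing page. This is just a quick sketch for experimenting; `page.html` is a hypothetical local copy, not a file the script itself uses:

```sh
# Save one listing page locally (same URL pattern as in the script above).
wget -q -O page.html http://jandan.net/ooxx/page-1

# Run the same extraction pipeline: grep keeps the <img ...> tags,
# sed cuts away everything before and after the src attribute value,
# leaving one image URL per line.
grep -ioE '<img src="http.*\.(jpg|gif)" */>' page.html |
    sed 's/<[Ii][Mm][Gg].*[Ss][Rr][Cc]\s*=\s*"*//;s/[" ].*//'
```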
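
Invocation of both scripts is straightforward. The file names `jandan-ooxx.sh` and `panoramio.sh` below are just placeholders for wherever the scripts happen to be saved, and `1234567` is a made-up user id:

```sh
# Pages 1 through 3 of the ooxx column; files are named <page>-<index>.<ext>,
# e.g. 1-1.jpg, 1-2.gif, 2-1.jpg, ...
./jandan-ooxx.sh 1 3

# All photos of one Panoramio user; files are named <number>_<title>.jpg,
# e.g. 001_Sunset.jpg, 002_Harbour.jpg, ...
./panoramio.sh 1234567
```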