
Scraping with RCurl

Published 2019-07-13 16:55


junjun

April 20, 2016



Reference: http://www.w2bc.com/Article/28489

1. Scraping a CAPTCHA with RCurl

The RCurl package provides an R interface to the libcurl library, which exposes a range of HTTP functionality from within R: downloading files from a server, keeping connections alive, uploading files, reading data in binary format, following redirects, password authentication, and so on.

What are curl and libcurl?
- curl: an open-source file-transfer tool that works from the command line using URL syntax
- libcurl: the library behind curl

Capabilities:
- fetching pages
- authentication
- uploading and downloading
- retrieving information
- ...

library(RCurl)
## Loading required package: bitops

myHttpheader <- c(
  "Accept" = "image/webp,*/*;q=0.8",
  "Accept-Encoding" = "gzip,deflate,sdch",
  "Accept-Language" = "zh-CN,zh;q=0.8",
  "Cache-Control" = "no-cache",
  "Connection" = "keep-alive",
  "Pragma" = "no-cache",
  "Referer" = "http://bbs.shangdu.com/s/regform.htm",
  "User-Agent" = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36"
)

png <- getBinaryURL("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ",
                    httpheader = myHttpheader)
# Use forward slashes (or doubled backslashes) in Windows paths;
# bare backslashes like "E:\新技术" are treated as escape sequences by R.
writeBin(png, con = "E:/新技术/爬虫/验证码/123.png")
# shell.exec("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ")
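A CAPTCHA is normally only valid for the session that issued it, so when the image must later be matched against a form submission, it can help to reuse a single curl handle with cookie handling turned on. Below is a minimal sketch of that idea, reusing the myHttpheader defined above; the in-memory cookie engine (cookiefile = "") and the local file name are illustrative choices, not part of the original example.

library(RCurl)

# One handle, shared across requests, so the session cookie set when the
# CAPTCHA is generated is also sent with any follow-up request.
ch <- getCurlHandle(cookiefile = "",        # "" turns on the in-memory cookie engine
                    httpheader = myHttpheader,
                    followlocation = TRUE)

captcha <- getBinaryURL("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ",
                        curl = ch)
writeBin(captcha, con = "captcha.png")      # illustrative local file name

# Any later getURL()/postForm() call that passes curl = ch reuses the same
# cookies, so the CAPTCHA the user solves still belongs to this session.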

2. The three main RCurl functions

1) The getURL() function

#1) Check whether a URL exists
url.exists(url = "www.baidu.com")
## [1] TRUE

# Collect debugging information
d <- debugGatherer()
# With verbose = TRUE, the contents of d$value() accumulate across requests
tmp <- getURL(url = "www.baidu.com", debugfunction = d$update, verbose = TRUE)
names(d$value())
## [1] "text"       "headerIn"   "headerOut"  "dataIn"     "dataOut"
## [6] "sslDataIn"  "sslDataOut"

# Server address and port number
cat(d$value()[1])
## Rebuilt URL to: www.baidu.com/
## Trying 180.149.132.151...
## Connected to www.baidu.com (180.149.132.151) port 80 (#0)
## Connection #0 to host www.baidu.com left intact
# print(d$value()[1])

# Headers returned by the server
cat(d$value()[2])
## HTTP/1.1 200 OK
## Date: Wed, 20 Apr 2016 08:04:09 GMT
## Content-Type: text/html
## Content-Length: 14613
## Last-Modified: Wed, 03 Sep 2014 02:48:32 GMT
## Connection: Keep-Alive
## Vary: Accept-Encoding
## Set-Cookie: BAIDUID=905A08993D710BA9389B6B3048107981:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: BIDUPSID=905A08993D710BA9389B6B3048107981; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: PSTM=1461139449; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## P3P: CP=" OTI DSP COR IVA OUR IND COM "
## Server: BWS/1.1
## X-UA-Compatible: IE=Edge,chrome=1
## Pragma: no-cache
## Cache-control: no-cache
## Accept-Ranges: bytes
##

# Headers submitted to the server
cat(d$value()[3])
## GET / HTTP/1.1
## Host: www.baidu.com
## Accept: */*
##

# Clear d$value(); afterwards all entries are empty
d$reset()
d$value()
##       text   headerIn  headerOut     dataIn    dataOut  sslDataIn
##         ""         ""         ""         ""         ""         ""
## sslDataOut
##         ""

#2) Inspect the headers returned by the server, in list form
h <- basicHeaderGatherer()
txt <- getURL(url = "http://www.baidu.com", headerfunction = h$update)
names(h$value())
##  [1] "Date"            "Content-Type"    "Content-Length"
##  [4] "Last-Modified"   "Connection"      "Vary"
##  [7] "Set-Cookie"      "Set-Cookie"      "Set-Cookie"
## [10] "P3P"             "Server"          "X-UA-Compatible"
## [13] "Pragma"          "Cache-control"   "Accept-Ranges"
## [16] "status"          "statusMessage"

h$value()  # every element is just a character string
## Date
## "Wed, 20 Apr 2016 08:04:10 GMT"
## Content-Type
## "text/html"
## Content-Length
## "14613"
## Last-Modified
## "Wed, 03 Sep 2014 02:48:32 GMT"
## Connection
## "Keep-Alive"
## Vary
## "Accept-Encoding"
## Set-Cookie
## "BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## Set-Cookie
## "BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## Set-Cookie
## "PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## P3P
## "CP=" OTI DSP COR IVA OUR IND COM ""
## Server
## "BWS/1.1"
## X-UA-Compatible
## "IE=Edge,chrome=1"
## Pragma
## "no-cache"
## Cache-control
## "no-cache"
## Accept-Ranges
## "bytes"
## status
## "200"
## statusMessage
## "OK"

cat(h$value())  # cat() prints the same values more readably
## Wed, 20 Apr 2016 08:04:10 GMT text/html 14613 Wed, 03 Sep 2014 02:48:32 GMT Keep-Alive Accept-Encoding BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com CP=" OTI DSP COR IVA OUR IND COM " BWS/1.1 IE=Edge,chrome=1 no-cache no-cache bytes 200 OK

# Inspect the transfer information for a request
curl <- getCurlHandle()
txt <- getURL(url = "http://www.baidu.com", curl = curl)
names(getCurlInfo(curl))
##  [1] "effective.url"           "response.code"
##  [3] "total.time"              "namelookup.time"
##  [5] "connect.time"            "pretransfer.time"
##  [7] "size.upload"             "size.download"
##  [9] "speed.download"          "speed.upload"
## [11] "header.size"             "request.size"
## [13] "ssl.verifyresult"        "filetime"
## [15] "content.length.download" "content.length.upload"
## [17] "starttransfer.time"      "content.type"
## [19] "redirect.time"           "redirect.count"
## [21] "private"                 "http.connectcode"
## [23] "httpauth.avail"          "proxyauth.avail"
## [25] "os.errno"                "num.connects"
## [27] "ssl.engines"             "cookielist"
## [29] "lastsocket"              "ftp.entry.path"
## [31] "redirect.url"            "primary.ip"
## [33] "appconnect.time"         "certinfo"
## [35] "condition.unmet"

getCurlInfo(curl)$response.code
## [1] 200

getCurlInfo(curl = curl)
## $effective.url
## [1] "http://www.baidu.com/"
##
## $response.code
## [1] 200
##
## $total.time
## [1] 0.063
##
## $namelookup.time
## [1] 0
##
## $connect.time
## [1] 0.016
##
## $pretransfer.time
## [1] 0.016
##
## $size.upload
## [1] 0
##
## $size.download
## [1] 14613
##
## $speed.download
## [1] 231952
##
## $speed.upload
## [1] 0
##
## $header.size
## [1] 750
##
## $request.size
## [1] 52
##
## $ssl.verifyresult
## [1] 0
##
## $filetime
## [1] -1
##
## $content.length.download
## [1] 14613
##
## $content.length.upload
## [1] -1
##
## $starttransfer.time
## [1] 0.032
##
## $content.type
## [1] "text/html"
##
## $redirect.time
## [1] 0
##
## $redirect.count
## [1] 0
##
## $private
## NULL
##
## $http.connectcode
## [1] 0
##
## $httpauth.avail
## [1] 0
##
## $proxyauth.avail
## [1] 0
##
## $os.errno
## [1] 0
##
## $num.connects
## [1] 1
##
## $ssl.engines
## [1] "dynamic"
##
## $cookielist
## character(0)
##
## $lastsocket
## [1] 652
##
## $ftp.entry.path
## NULL
##
## $redirect.url
## NULL
##
## $primary.ip
## [1] "180.149.132.151"
##
## $appconnect.time
## [1] 0
##
## $certinfo
## list()
##
## $condition.unmet
## [1] 0

#3) Set your own header: here the request pretends to come from an iPhone running Mac OS
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)
d <- debugGatherer()
tmp <- getURL(url = "http://www.baidu.com", httpheader = myheader,
              debugfunction = d$update, verbose = TRUE)
cat(d$value()[3])
## GET / HTTP/1.1
## Host: www.baidu.com
## User-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7
## Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
## Accept-Language: en-us
## Connection: keep-alive
## Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7
##

# Many other options can be set as well, 172 in total
listCurlOptions()
##   [1] "address.scope"              "append"
##   [3] "autoreferer"                "buffersize"
##   [5] "cainfo"                     "capath"
##   [7] "certinfo"                   "closepolicy"
##   [9] "connect.only"               "connecttimeout"
##  [11] "connecttimeout.ms"          "conv.from.network.function"
##  [13] "conv.from.utf8.function"    "conv.to.network.function"
##  [15] "cookie"                     "cookiefile"
##  [17] "cookiejar"                  "cookielist"
##  [19] "cookiesession"              "copypostfields"
##  [21] "crlf"                       "crlfile"
##  [23] "customrequest"              "debugdata"
##  [25] "debugfunction"              "dirlistonly"
##  [27] "dns.cache.timeout"          "dns.use.global.cache"
##  [29] "egdsocket"                  "encoding"
##  [31] "errorbuffer"                "failonerror"
##  [33] "file"                       "filetime"
##  [35] "followlocation"             "forbid.reuse"
##  [37] "fresh.connect"              "ftp.account"
##  [39] "ftp.alternative.to.user"    "ftp.create.missing.dirs"
##  [41] "ftp.filemethod"             "ftp.response.timeout"
##  [43] "ftp.skip.pasv.ip"           "ftp.ssl"
##  [45] "ftp.ssl.ccc"                "ftp.use.eprt"
##  [47] "ftp.use.epsv"               "ftpappend"
##  [49] "ftplistonly"                "ftpport"
##  [51] "ftpsslauth"                 "header"
##  [53] "headerfunction"             "http.content.decoding"
##  [55] "http.transfer.decoding"     "http.version"
##  [57] "http200aliases"             "httpauth"
##  [59] "httpget"                    "httpheader"
##  [61] "httppost"                   "httpproxytunnel"
##  [63] "ignore.content.length"      "infile"
##  [65] "infilesize"                 "infilesize.large"
##  [67] "interface"                  "ioctldata"
##  [69] "ioctlfunction"              "ipresolve"
##  [71] "issuercert"                 "keypasswd"
##  [73] "krb4level"                  "krblevel"
##  [75] "localport"                  "localportrange"
##  [77] "low.speed.limit"            "low.speed.time"
##  [79] "max.recv.speed.large"       "max.send.speed.large"
##  [81] "maxconnects"                "maxfilesize"
##  [83] "maxfilesize.large"          "maxredirs"
##  [85] "netrc"                      "netrc.file"
##  [87] "new.directory.perms"        "new.file.perms"
##  [89] "nobody"                     "noprogress"
##  [91] "noproxy"                    "nosignal"
##  [93] "opensocketdata"             "opensocketfunction"
##  [95] "password"                   "port"
##  [97] "post"                       "post301"
##  [99] "postfields"                 "postfieldsize"
## [101] "postfieldsize.large"        "postquote"
## [103] "postredir"                  "prequote"
## [105] "private"                    "progressdata"
## [107] "progressfunction"           "protocols"
## [109] "proxy"                      "proxy.transfer.mode"
## [111] "proxyauth"                  "proxypassword"
## [113] "proxyport"                  "proxytype"
## [115] "proxyusername"              "proxyuserpwd"
## [117] "put"                        "quote"
## [119] "random.file"                "range"
## [121] "readfunction"               "redir.protocols"
## [123] "referer"                    "resume.from"
## [125] "resume.from.large"          "seekdata"
## [127] "seekfunction"               "share"
## [129] "sockoptdata"                "sockoptfunction"
## [131] "socks5.gssapi.nec"          "socks5.gssapi.service"
## [133] "ssh.auth.types"             "ssh.host.public.key.md5"
## [135] "ssh.private.keyfile"        "ssh.public.keyfile"
## [137] "ssl.cipher.list"            "ssl.ctx.data"
## [139] "ssl.ctx.function"           "ssl.sessionid.cache"
## [141] "ssl.verifyhost"             "ssl.verifypeer"
## [143] "sslcert"                    "sslcertpasswd"
## [145] "sslcerttype"                "sslengine"
## [147] "sslengine.default"          "sslkey"
## [149] "sslkeypasswd"               "sslkeytype"
## [151] "sslversion"                 "stderr"
## [153] "tcp.nodelay"                "telnetoptions"
## [155] "tftp.blksize"               "timecondition"
## [157] "timeout"                    "timeout.ms"
## [159] "timevalue"                  "transfertext"
## [161] "unrestricted.auth"          "upload"
## [163] "url"                        "use.ssl"
## [165] "useragent"                  "username"
## [167] "userpwd"                    "verbose"
## [169] "writedata"                  "writefunction"
## [171] "writeheader"                "writeinfo"

2) The getForm() function

# wd=rcurl means the search keyword is "rcurl"
url <- c("http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=2&ch=&tn=SE_hldp02870_0v135xhf&bar=&wd=rcurl&rsv_spt=1&rsv_pq=a3ed162a0088df8f&rsv_t=43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q&rsv_enter=1&inputT=2119")

# Inspect the structure and values of the query string
getFormParams(query = url)
## ie
## "utf-8"
## f
## "8"
## rsv_bp
## "1"
## rsv_idx
## "2"
## ch
## ""
## tn
## "SE_hldp02870_0v135xhf"
## bar
## ""
## wd
## "rcurl"
## rsv_spt
## "1"
## rsv_pq
## "a3ed162a0088df8f"
## rsv_t
## "43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q"
## rsv_enter
## "1"
## inputT
## "2119"

names(getFormParams(query = url))
##  [1] "ie"        "f"         "rsv_bp"    "rsv_idx"   "ch"
##  [6] "tn"        "bar"       "wd"        "rsv_spt"   "rsv_pq"
## [11] "rsv_t"     "rsv_enter" "inputT"

tmp <- getForm(uri = "http://www.baidu.com/s",
               ie = "utf-8", f = "8", rsv_bp = "1", rsv_idx = "2", ch = "",
               tn = "SE_hldp02870_0v135xhf", bar = "", wd = "rcurl",
               rsv_spt = "1", rsv_pq = "a3ed162a0088df8f",
               rsv_t = "43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q",
               rsv_enter = "1", inputT = "2119")
# Note: getForm() is not entirely reliable (the cause is still unclear);
# sometimes it takes two or three runs before the page actually loads.
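Given the instability noted above, one pragmatic workaround is to wrap getForm() in a small retry loop. A minimal sketch: get_form_retry() is a hypothetical helper, and the retry count, pause, and success check are all assumptions.

library(RCurl)

# Hypothetical retry wrapper around a flaky getForm() call.
get_form_retry <- function(uri, ..., tries = 3) {
  for (i in seq_len(tries)) {
    res <- tryCatch(getForm(uri = uri, ...), error = function(e) NULL)
    # Heuristic success check: a non-empty character result.
    if (!is.null(res) && nchar(res) > 0) return(res)
    Sys.sleep(1)  # brief pause before retrying (assumed value)
  }
  stop("getForm() failed after ", tries, " attempts")
}

tmp <- get_form_retry(uri = "http://www.baidu.com/s", ie = "utf-8", wd = "rcurl")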

3) getBinaryURL(): downloading a single file and downloading files in bulk

#1) Download a single file
url <- "http://rfunction.com/code/1201/120103.R"
tmp <- getBinaryURL(url)
# Again, use forward slashes in the Windows path.
note <- file("E:/新技术/爬虫/验证码/120103.R", open = "wb")
writeBin(tmp, note)
close(note)

#2) Download files in bulk
url <- "http://rfunction.com/code/1202/"
tmp <- RCurl::getURL(url, httpheader = myheader)  # fetch the directory-listing page
# Split the HTML at each list-item link to pull out the file names
tmp_files <- strsplit(x = tmp, split = "<li><a href=\"")[[1]]
tmp_files1 <- strsplit(tmp_files, split = "\"")
tmp_files2 <- lapply(X = tmp_files1, function(file) {file[1]})
files <- unlist(tmp_files2)
files <- files[c(-1, -2)]  # drop the page preamble and the parent-directory link
baseURL <- "http://rfunction.com/code/1202/"
for (i in 1:length(files)) {
  fullURL <- paste(baseURL, files[i], sep = "")
  tmp <- getBinaryURL(fullURL)
  note <- file(paste("E:/新技术/爬虫/验证码/1202-", files[i], sep = ""), open = "wb")
  writeBin(tmp, note)
  close(note)
  Sys.sleep(0.002)  # sleep for 0.002 seconds between downloads
}
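The strsplit() parsing above is fragile because it depends on the exact HTML of the directory listing. As an alternative, the listing can be parsed with the XML package instead; the sketch below assumes XML is installed and that every wanted file ends in .R (both assumptions, not part of the original code).

library(RCurl)
library(XML)

url <- "http://rfunction.com/code/1202/"
page <- getURL(url)

# getHTMLLinks() extracts the href attributes from the parsed HTML,
# which is sturdier than splitting the raw page text.
links <- getHTMLLinks(page)
files <- grep("\\.R$", links, value = TRUE)  # assumed filter: keep only .R scripts

for (f in files) {
  bin <- getBinaryURL(paste0(url, f))
  writeBin(bin, con = basename(f))  # save into the current working directory
  Sys.sleep(0.5)                    # polite pause between downloads (assumed)
}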