Tests.php 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. <?php
  2. /**
  3. * Created by PhpStorm.
  4. * User: win 10
  5. * Date: 2020/8/14
  6. * Time: 9:49
  7. */
  8. namespace app\admin\controller;
  9. use Aliyun\Common\Utilities\DateUtils;
  10. use phpDocumentor\Reflection\FqsenResolver;
  11. use think\Controller;
  12. use tp5redis\redis\driver\Redis;//redis扩展类
  13. use think\Request;
  14. use QL\QueryList;//内容采集类
  15. //测试功能专用
  /**
   * Scratch controller used for trying out features; at the moment it holds a
   * small QueryList-based scraper for an article listing page plus an image
   * downloader.
   */
  class Tests extends Controller
  {
      /** CSS selector of the listing container every scrape is scoped to. */
      const LISTING_RANGE = '.td-main-content>.td-ss-main-content';

      /** Web path that remote image URLs are rewritten to in the generated article HTML. */
      const LOCAL_IMAGE_WEB_DIR = '/static/uploads/images/';

      /** Lower-case file extensions that are treated as downloadable images. */
      private static $imageExtensions = ['jpg', 'jpeg', 'gif', 'png'];

      /**
       * Scrape one listing page and dump the article rows built from it.
       *
       * Each row has the keys `content` (an <img> tag pointing at the local copy
       * of the image), `title`, `author` and `time` (today's date, Y-m-d).
       * Titles and images are paired by position; a title without a matching
       * image gets an empty `src`.
       *
       * The rows are only dumped; inserting them into the `article` table is
       * intentionally left disabled, as it was before:
       *     db('article')->insertAll($result);
       */
      public function gather()
      {
          set_time_limit(0); // scraping may be slow; never time out

          // Pages 1-10 (content and images) have already been collected;
          // the next collection run should start from page 11.
          $url = 'https://smartlifetricks.com/category/living/page/2';
          $rules = [
              // 'html' keeps the markup; switch to 'text' when the value is
              // going to be sent to a translation service.
              'title' => ['div>div>div>h3', 'html'],
              'img' => ['div>div>div>div>div>a', 'html'],
          ];

          $row = $this->scrapeFirstRow($url, $rules);
          $titleHtml = isset($row['title']) ? (string) $row['title'] : '';
          $imageHtml = isset($row['img']) ? (string) $row['img'] : '';

          // Inner content of every <a>...</a>. The closing '>' of the opening tag
          // is part of the pattern so it does not leak into the captured title.
          $titles = [];
          if (preg_match_all('/<a\b[^>]*>([\s\S]*?)<\/a>/i', $titleHtml, $anchors)) {
              $titles = $anchors[1];
          }
          $imageUrls = $this->extractImageUrls($imageHtml);

          $today = date('Y-m-d'); // same for every row, so computed once
          $result = [];
          foreach ($titles as $index => $title) {
              $remoteImage = isset($imageUrls[$index]) ? $imageUrls[$index] : '';
              $result[] = [
                  'content' => '<img src="' . $this->localImagePath($remoteImage) . '">',
                  'title' => trim($title),
                  'author' => '爱奇君',
                  'time' => $today,
              ];
          }

          dump($result);
      }

      /**
       * Download one remote image into the project's upload directory.
       *
       * @param string $imgUrl Absolute http(s) URL of the image.
       *
       * @return string '/upload/img/<file name>' when the image was written;
       *                otherwise $imgUrl unchanged (not an http(s) URL, not a
       *                jpg/jpeg/gif/png file, or the download/write failed).
       *
       * NOTE(review): the file is written to public/static/uploads/ while the
       * returned path starts with /upload/img/ and gather() links to
       * /static/uploads/images/ - these three locations do not agree. Left
       * unchanged here because the intended one cannot be determined from this
       * file; confirm and align.
       */
      public function saveImg($imgUrl)
      {
          set_time_limit(0);

          // Only fetch real web resources. Without this check any string -
          // including a local path or a stream wrapper - would be handed to
          // file_get_contents().
          // NOTE(review): if this method is reachable as a controller action,
          // $imgUrl is request-controlled; consider restricting allowed hosts.
          if (!is_string($imgUrl) || !preg_match('/^https?:\/\//i', $imgUrl)) {
              return $imgUrl;
          }

          // Take the name from the URL path so a query string or fragment cannot
          // end up in the extension or the file name.
          $urlPath = parse_url($imgUrl, PHP_URL_PATH);
          if (!is_string($urlPath) || $urlPath === '') {
              return $imgUrl;
          }
          $baseName = basename($urlPath);
          $extension = strtolower(pathinfo($baseName, PATHINFO_EXTENSION));
          if (!in_array($extension, self::$imageExtensions, true)) {
              return $imgUrl;
          }

          $imageData = file_get_contents($imgUrl);
          if ($imageData === false) {
              return $imgUrl; // download failed: do not leave an empty file behind
          }

          // Absolute path of the saved file.
          $path = __DIR__ . DS . '../../../public/static/uploads' . DS . $baseName;
          if (file_put_contents($path, $imageData) === false) {
              return $imgUrl;
          }

          return '/upload/img/' . $baseName;
      }

      /**
       * Download every image found on one listing page into the project and
       * dump, per source URL, what saveImg() returned for it.
       */
      public function downImg()
      {
          set_time_limit(0);

          $url = 'https://smartlifetricks.com/category/living/page/11';
          $rules = [
              // 'html' keeps the markup so the src attributes can be extracted.
              'img' => ['div>div>div>div>div>a', 'html'],
          ];

          $row = $this->scrapeFirstRow($url, $rules);
          $imageHtml = isset($row['img']) ? (string) $row['img'] : '';

          $saved = [];
          foreach ($this->extractImageUrls($imageHtml) as $imageUrl) {
              $saved[$imageUrl] = $this->saveImg($imageUrl);
          }

          dump($saved);
      }

      /**
       * Run the QueryList scrape for $url and return the first result row,
       * or an empty array when the scrape produced nothing.
       *
       * @param string $url   Page to fetch.
       * @param array  $rules QueryList rules (field name => [selector, attribute]).
       *
       * @return array
       */
      private function scrapeFirstRow($url, array $rules)
      {
          $rows = QueryList::get($url)->rules($rules)->range(self::LISTING_RANGE)->queryData();

          if (is_array($rows) && isset($rows[0]) && is_array($rows[0])) {
              return $rows[0];
          }

          return [];
      }

      /**
       * Collect the values of all src="..." attributes in $html that point at a
       * jpg/jpeg/gif/png file. Each value is cut right after the extension.
       *
       * The match stays inside one quoted attribute value and requires a dot
       * before the extension, so a path segment such as "/pngs/" does not
       * truncate the URL.
       *
       * @param string $html
       *
       * @return array List of image URLs in document order.
       */
      private function extractImageUrls($html)
      {
          $pattern = '/src="([^"]*?\.(?:jpe?g|gif|png))/i';
          if (!preg_match_all($pattern, (string) $html, $found)) {
              return [];
          }

          return $found[1];
      }

      /**
       * Map a remote http(s) image URL to the local web path it will be served
       * from; any other value (including '') is returned unchanged.
       *
       * Plain concatenation is used so that '$' or '\' in a file name cannot be
       * interpreted as a regex replacement reference.
       *
       * @param string $remoteUrl
       *
       * @return string
       */
      private function localImagePath($remoteUrl)
      {
          if (!preg_match('/^https?:\/\//i', $remoteUrl)) {
              return $remoteUrl;
          }

          return self::LOCAL_IMAGE_WEB_DIR . basename($remoteUrl);
      }
  }