Demo.php 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. <?php
  2. /**
  3. * Created by PhpStorm.
  4. * User: win 10
  5. * Date: 2020/9/2
  6. * Time: 10:33
  7. */
  8. namespace app\admin\controller;
  9. use think\Controller;
  10. use QL\QueryList;//内容采集类
  11. //测试功能专用
  /**
   * Scratch controller reserved for trying out features.
   *
   * At the moment it only hosts gather(), a one-off listing-page scraper.
   */
  class Demo extends Controller
  {
      /**
       * Scrape one listing page and dump the article rows built from it.
       *
       * The page is fetched through QueryList, the title and image fragments of
       * the first matched range are pulled apart with regular expressions, and
       * one row per title is assembled. The rows are only dumped; the database
       * insert is still commented out at the end of the method.
       *
       * Changes compared with the previous revision:
       *  - the leftover `dump($data); die;` debugging stop was removed, because
       *    it ended the request before any of the processing below could run;
       *  - an empty scrape result now yields an empty row list instead of
       *    reading undefined array offsets.
       *
       * @return void
       */
      public function gather()
      {
          // Never let PHP's execution time limit abort the run.
          set_time_limit(0);

          // Alternative source kept for reference:
          // $url = 'https://shareably.net/';
          // $range = '.fullwidth>.row';
          // Original author's note: pages 1-10 (content and images) have already
          // been collected, so the next collection run should start from page 11.
          $url = 'https://smartlifetricks.com/category/living/page/2';
          $range = '.td-main-content>.td-ss-main-content';

          // Collection rules: [selector, attribute]. Switch 'html' to 'text' when
          // the values are going to be fed to a translation service.
          $rules = [
              'title' => ['div>div>div>h3', 'html'],
              'img' => ['div>div>div>div>div>a', 'html'],
              // 'time' => ['time', 'datetime'],
          ];

          $data = QueryList::get($url)->rules($rules)->range($range)->queryData();

          // Only the first matched range is processed. Fall back to empty
          // fragments when the selectors matched nothing.
          $first = (isset($data[0]) && is_array($data[0])) ? $data[0] : [];
          $titleHtml = isset($first['title']) ? (string) $first['title'] : '';
          $imageHtml = isset($first['img']) ? (string) $first['img'] : '';

          $result = $this->buildArticles(
              $this->extractAnchorContents($titleHtml),
              $this->extractImageUrls($imageHtml)
          );

          dump($result);
          // db('article')->insertAll($result); // insert the collected rows into the table
      }

      /**
       * Return the inner HTML of every <a> element found in the fragment.
       *
       * @param string $html HTML fragment to search.
       * @return array List of anchor contents, in document order.
       */
      private function extractAnchorContents($html)
      {
          // The explicit `>` closes the opening tag; without it the capture group
          // started with a stray ">" in front of every title. `\b` stops tags
          // such as <article> or <abbr> from being treated as anchors.
          preg_match_all('/<a\b[^>]*>([\s\S]*?)<\/a>/i', $html, $found);

          return isset($found[1]) ? $found[1] : [];
      }

      /**
       * Return every src="..." value in the fragment that ends in jpg, jpeg, gif or png.
       *
       * @param string $html HTML fragment to search.
       * @return array List of image URLs, in document order.
       */
      private function extractImageUrls($html)
      {
          preg_match_all('/(src)="(.*?(jpg|jpeg|gif|png))/i', $html, $found);

          // Group 2 holds the URL up to and including the file extension.
          return isset($found[2]) ? $found[2] : [];
      }

      /**
       * Pair each title with the image at the same position and build one row per title.
       *
       * @param array $titles Title strings.
       * @param array $images Image URLs; a title without a counterpart gets an empty src.
       * @return array List of rows with the keys content, title, author and time.
       */
      private function buildArticles(array $titles, array $images)
      {
          $today = date('Y-m-d'); // identical for every row, so computed once
          $articles = [];

          foreach ($titles as $index => $title) {
              $remote = isset($images[$index]) ? $images[$index] : '';

              $articles[] = [
                  'content' => '<img src="' . $this->toLocalImagePath($remote) . '">',
                  'title' => $title,
                  'author' => '爱奇君',
                  'time' => $today,
              ];
          }

          return $articles;
      }

      /**
       * Map an absolute http(s) image URL to its path under the local upload directory.
       *
       * Anything that is not an http(s) URL, including the empty string, is
       * returned unchanged. The image itself is not downloaded here.
       *
       * @param string $url Image URL taken from the scraped page.
       * @return string Local path for http(s) URLs, otherwise the input.
       */
      private function toLocalImagePath($url)
      {
          if (!preg_match('/^https?:\/\//i', $url)) {
              return $url;
          }

          // Plain concatenation rather than preg_replace(): a file name containing
          // "$1" or "\1" would otherwise be read as a back-reference.
          return '/static/uploads/images/' . basename($url);
      }
  }