|
@@ -1,108 +0,0 @@
|
|
|
-<?php
|
|
|
-/**
|
|
|
- * Created by PhpStorm.
|
|
|
- * User: win 10
|
|
|
- * Date: 2020/9/2
|
|
|
- * Time: 10:33
|
|
|
- */
|
|
|
-
|
|
|
-namespace app\admin\controller;
|
|
|
-use think\Controller;
|
|
|
-use QL\QueryList;//内容采集类
|
|
|
-//测试功能专用
|
|
|
-
|
|
|
-
|
|
|
class Demo extends Controller
{
    /**
     * Scrape one listing page and dump the article rows built from it.
     *
     * Fetches the hard-coded listing URL with QueryList, reads the title
     * markup and the image markup from the first matched content container,
     * pairs every anchor text found in the title markup with the image URL
     * at the same position, and dumps the resulting rows with dump().
     * Nothing is written to the database.
     *
     * Testing/debug action only. An earlier revision stopped right after
     * dumping the raw scrape result (`dump($data); die;`), which left the
     * row-building code unreachable; that debug halt has been removed.
     */
    public function gather()
    {
        set_time_limit(0); // scraping may be slow; never time out

        // Pages 1-10 (content and images) have already been collected;
        // the next collection run should start from page 11.
        $url = "https://smartlifetricks.com/category/living/page/2";
        $range = '.td-main-content>.td-ss-main-content';

        // Extraction rules: field => [selector, attribute].
        // Switch 'html' to 'text' when the value is going to be translated.
        $rules = [
            'title' => ['div>div>div>h3', 'html'],
            'img' => ['div>div>div>div>div>a', 'html'],
        ];

        $data = QueryList::get($url)->rules($rules)->range($range)->queryData();

        // The scrape may return nothing (layout change, network failure);
        // fall back to empty markup instead of reading undefined offsets.
        $first = isset($data[0]) ? $data[0] : [];
        $titleHtml = isset($first['title']) ? (string) $first['title'] : '';
        $imageHtml = isset($first['img']) ? (string) $first['img'] : '';

        $result = $this->buildArticleRows(
            $this->extractAnchorTexts($titleHtml),
            $this->extractImageUrls($imageHtml)
        );

        dump($result);

        // Persisting is currently disabled. The original code had
        // db('article')->insertAll($result) commented out at this point.
    }

    /**
     * Collect image URLs from the src attributes found in an HTML fragment.
     *
     * @param string $html HTML fragment to search.
     * @return array Captured values, each running from just after `src="`
     *               up to and including a jpg/jpeg/gif/png extension.
     */
    private function extractImageUrls($html)
    {
        preg_match_all('/(src)="(.*?(jpg|jpeg|gif|png))/i', $html, $matches);

        return $matches[2];
    }

    /**
     * Collect the inner content of every <a> element in an HTML fragment.
     *
     * The pattern consumes the closing `>` of the opening tag before the
     * capture group starts. The previous pattern lacked it, so every
     * captured title began with a stray `>`.
     *
     * @param string $html HTML fragment to search.
     * @return array Inner HTML of each anchor, in document order.
     */
    private function extractAnchorTexts($html)
    {
        preg_match_all('/<a\b[^>]*>([\s\S]*?)<\/a>/i', $html, $matches);

        return $matches[1];
    }

    /**
     * Map a remote image URL to the local path where its copy is stored.
     *
     * Values that are not http(s) URLs are returned unchanged. The path is
     * built by plain concatenation rather than preg_replace(), so `$` or
     * `\` sequences in the scraped file name cannot be interpreted as
     * replacement back-references.
     *
     * @param string $imageUrl Scraped image URL, possibly empty.
     * @return string Local path under /static/uploads/images/, or the input.
     */
    private function toLocalImagePath($imageUrl)
    {
        if (preg_match("/^(http|https):\/\/.*$/i", $imageUrl) !== 1) {
            return $imageUrl;
        }

        return '/static/uploads/images/' . basename($imageUrl);
    }

    /**
     * Build one article row per title, pairing titles and images by index.
     *
     * @param array $titles    Title strings.
     * @param array $imageUrls Image URLs; a title without a counterpart
     *                         gets an empty image source.
     * @return array List of rows with the keys content, title, author, time.
     */
    private function buildArticleRows(array $titles, array $imageUrls)
    {
        $today = date('Y-m-d');
        $rows = [];

        foreach ($titles as $index => $title) {
            $imageUrl = isset($imageUrls[$index]) ? $imageUrls[$index] : '';

            // Scraped text is untrusted: escape it for the attribute context.
            // double_encode is off because the value was lifted from HTML and
            // may already contain entities such as &amp;.
            $src = htmlspecialchars(
                $this->toLocalImagePath($imageUrl),
                ENT_QUOTES,
                'UTF-8',
                false
            );

            $rows[] = [
                'content' => '<img src="' . $src . '">',
                'title' => $title,
                'author' => '爱奇君',
                'time' => $today,
            ];
        }

        return $rows;
    }
}
|