Запустите scrapy crawler из представления DRF

я использовал scrapy в моем проекте и я хочу вызвать моего паука с URL из DRF (Django Rest Framework) View, какой лучший способ?

Один из способов, который я использовал:

from uuid import uuid4
from django.core.cache import cache
from urllib.parse import urlparse
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from django.views.decorators.http import require_POST, require_http_methods
from django.shortcuts import render
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from scrapyd_api import ScrapydAPI
from main.models import Quote
from scrapy_app.scrapy_app.items import QouteItem
import os
from iCrawler.settings import BASE_DIR

scrapyd = ScrapydAPI('http://localhost:6800')


def is_valid_url(url):
    validate = URLValidator()
    try:
        validate(url)
    except ValidationError:
        return False

    return True


@csrf_exempt
@require_http_methods(['POST', 'GET'])
def crawl(request):
    if request.method == 'POST':

        url = request.POST.get('url', None) 

        if not url:
            return JsonResponse({'error': 'Missing  args'})

        if not is_valid_url(url):
            return JsonResponse({'error': 'URL is invalid'})

        domain = urlparse(url).netloc 
        unique_id = str(uuid4()) 


        settings = {
            'unique_id': unique_id,
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }


        task = scrapyd.schedule('default', 'icrawler',
                                settings=settings, url=url, domain=domain)

        return JsonResponse({'task_id': task, 'unique_id': unique_id, 'status': 'started'})

    elif request.method == 'GET':

        task_id = request.GET.get('task_id', None)
        unique_id = request.GET.get('unique_id', None)

        if not task_id or not unique_id:
            return JsonResponse({'error': 'Missing args'})

        status = scrapyd.job_status('default', task_id)
        if status == 'finished':
            try:
                item = QouteItem.objects.get(unique_id=unique_id)
                return JsonResponse({'data': item.to_dict['data']})
            except Exception as e:
                return JsonResponse({'error': str(e)})
        else:
            return JsonResponse({'status': status})

@csrf_exempt
@require_http_methods(['POST', 'GET'])
def spiding(request):
        url = request.POST.get('url')
        cwd = os.path.join(BASE_DIR, "start_spider.py")

        os.system('{} {}'.format('python3', cwd))
        try:
            os.environ.pop('start_url')
        except:
            pass
        os.environ['start_url'] = url 
        myVars = {}


        return JsonResponse({'ok': 'running spider'},status=200)
Вернуться на верх