387 lines
16 KiB
Python
387 lines
16 KiB
Python
import requests
|
||
from bs4 import BeautifulSoup
|
||
import re
|
||
import json
|
||
import asyncio
|
||
import aiohttp
|
||
import time
|
||
from googleapiclient.discovery import build
|
||
from google.oauth2 import service_account
|
||
import pandas as pd
|
||
import openpyxl
|
||
|
||
|
||
# OAuth scopes granting read/write access to Google Sheets and Drive.
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]

# Service-account credentials; the JSON key file must sit next to this script.
credentials = service_account.Credentials.from_service_account_file('creds_service_acc.json', scopes=SCOPES)

try:
    spreadsheet_service = build('sheets', 'v4', credentials=credentials)
except Exception:
    # Auto-discovery occasionally fails; retry with an explicit discovery URL.
    # (Narrowed from a bare `except:` so Ctrl-C / SystemExit are not swallowed.)
    DISCOVERY_SERVICE_URL = 'https://sheets.googleapis.com/$discovery/rest?version=v4'
    spreadsheet_service = build('sheets', 'v4', credentials=credentials, discoveryServiceUrl=DISCOVERY_SERVICE_URL)

# Desktop-browser User-Agent so the shop does not reject scripted requests.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
|
||
|
||
def files_jso():
    """Start a fresh ./tovar.json by writing the opening bracket of a JSON array.

    Scraped records are appended to it as '<obj>,\\n' by main1(); files_js()
    later trims the trailing comma and closes the array.
    """
    with open("./tovar.json", "w", encoding='utf8') as file:
        file.write('[')  # plain literal; the original used a pointless f-string
|
||
|
||
|
||
def files_js():
    """Finalize ./tovar.json: drop the trailing ',\\n' left by the last appended
    record and close the JSON array with ']'.

    Robustness fix: when no record was ever appended (file is just '['),
    the original blindly chopped two characters and produced the invalid
    content ']'; now the comma is stripped only when actually present,
    yielding a valid empty array '[]'.
    """
    with open("./tovar.json", "r", encoding='utf8') as file:
        content = file.read()
    if content.endswith(',\n'):
        content = content[:-2]
    # Single rewrite instead of the original write-then-append pair.
    with open("./tovar.json", "w", encoding='utf8') as file:
        file.write(content + ']')
|
||
|
||
def files_tovar():
    """Truncate both work files so a new run starts from a clean slate."""
    # Opening in "w" mode already truncates; writing f'' was redundant.
    for path in ("./urltovar.txt", "./catalogall.txt"):
        with open(path, "w", encoding='utf8'):
            pass
|
||
|
||
|
||
|
||
def prov_pagen():
    """For each category URL in ./catalog.txt, detect the per-page item count
    selector (<select id="nb_page_items">) and append the URL — with the
    largest page size as '?n=...' when one was found — to ./catalogall.txt.

    Fixes: the input file handle is now closed (context manager), and a
    single requests.Session is reused instead of creating one per line.
    """
    session = requests.Session()
    with open('./catalog.txt', "r") as file1:
        for raw in file1:
            url = raw.strip()
            if not url:
                continue  # skip blank lines instead of requesting an empty URL
            html = session.get(url, headers=headers).text
            soup = BeautifulSoup(html, 'html.parser')
            pagen = soup.find_all('select', {"id": 'nb_page_items'})
            # Extract the text of the last <option> (the maximum page size).
            u = ' '.join(re.findall(r'>([^<>]+)</option> </select>', str(pagen)))
            if len(u) == 0:
                # No pagination selector: keep the URL as-is.
                print(url)
                with open("./catalogall.txt", "a") as file:
                    file.write(url + '\n')
            else:
                print(f'{url}?n={u}')
                with open("./catalogall.txt", "a") as file:
                    file.write(f'{url}?n={u}' + '\n')
|
||
|
||
|
||
def catalog():
    """Scrape the shop start page and save every category link, one per line,
    to ./catalog.txt (overwriting any previous content).
    """
    start_url = 'https://allsuvenir.com.ua/ru/'
    # Context-managed session so the connection pool is released.
    with requests.Session() as s:
        res = s.get(start_url, headers=headers).text
    soup = BeautifulSoup(res, 'html.parser')
    items = soup.find_all('div', class_='list')
    # Category anchors carry an empty title attribute on this site.
    links = ' '.join(re.findall(r'<a href="([^<>]+)" title="">', str(items)))
    line = links.replace(' ', '\n')
    with open("./catalog.txt", "w") as file:
        file.write(line)
|
||
|
||
|
||
def urls_tovar():
    """Walk every (paginated) category URL in ./catalogall.txt and append all
    product-page URLs found there to ./urltovar.txt.

    Fixes: the input file handle is now closed (context manager), and a
    single requests.Session is reused instead of creating one per line.
    """
    session = requests.Session()
    with open('./catalogall.txt', "r") as file1:
        for raw in file1:
            url = raw.strip()
            if not url:
                continue  # skip blank lines instead of requesting an empty URL
            print(url)
            html = session.get(url, headers=headers).text
            soup = BeautifulSoup(html, 'html.parser')
            items = soup.find_all('ul', class_='product_list grid row')
            # Product anchors look like: class="product-name" href="..." itemprop="url"
            found = ' '.join(re.findall(r'"product-name" href="([^<>]+)" itemprop="url"', str(items)))
            line = found.replace(' ', '\n')
            print(line)
            with open("./urltovar.txt", "a") as file:
                file.write(line + '\n')
|
||
|
||
|
||
async def main1():
    """Fetch every product URL left in ./urltovar.txt, scrape the product
    card (name, article code, price, photos, short description) and append
    one JSON object per product to ./tovar.json.

    After a product is written, its URL is removed from ./urltovar.txt so a
    crashed run can resume from where it stopped.

    Fixes relative to the original:
    - `await response.read()` instead of calling the unbound
      `aiohttp.StreamReader.read(response.content)`;
    - `await asyncio.sleep(100)` instead of `time.sleep(100)`, which blocked
      the whole event loop;
    - the URL list is snapshotted up front instead of iterating an open
      handle of the very file the loop rewrites;
    - the input file handle is closed.
    """
    async with aiohttp.ClientSession() as session:
        # Snapshot the work list; the file is rewritten inside the loop.
        with open('./urltovar.txt', "r") as file1:
            urls = [ln.strip() for ln in file1 if ln.strip()]
        for url in urls:
            async with session.get(url, headers=headers) as response:
                r = await response.read()
                soup = BeautifulSoup(r, 'html.parser')
                print(f'===========> {url}')
                try:
                    table = soup.find_all('div', {"id": 'center_column'})
                    for item in table:
                        try:
                            name = item.find('div', class_='pb-center-column col-xs-12 col-sm-4').find('h1').get_text(strip=True)
                            print(name)
                            code = item.find('p', {"id": 'product_reference'}).get_text(strip=True)
                            codes = code.replace('Артикул', '')
                            # Short description is optional on product pages.
                            try:
                                ops = item.find('div', {"id": 'short_description_content'}).get_text()
                                print(ops)
                            except Exception:
                                ops = ""
                            prices = item.find('span', {"id": 'our_price_display'}).text
                            price = prices[:prices.find('$')]  # strip the '$' suffix
                            fotos = item.find('ul', {"id": 'thumbs_list_frame'})
                            foto1 = ' '.join(re.findall(r'"other-views" href="([^<>]+)" title="', str(fotos)))
                            foto = foto1.replace(' ', ', ')
                            js1 = {'code': f'ALL_{codes}',
                                   'name': name,
                                   'price': price,
                                   'foto': foto,
                                   'op': ops}
                            # Append the record; files_js() later closes the array.
                            with open("./tovar.json", "a", encoding='utf8') as file:
                                json.dump(js1, file, ensure_ascii=False)
                                file.write(',' + '\n')
                            # Drop the scraped URL from the on-disk work list.
                            with open("./urltovar.txt", "r", encoding='utf8') as f:
                                lines = f.readlines()
                            pattern = re.compile(re.escape(url))
                            with open("./urltovar.txt", "w", encoding='utf8') as f:
                                for line in lines:
                                    if pattern.search(line) is None:
                                        f.write(line)
                            print(codes)
                        except Exception as a:
                            # Malformed product card: log and continue.
                            print(a)
                except Exception as a:
                    # Page-level failure: log and back off before continuing.
                    print(a)
                    await asyncio.sleep(100)
                print('____________________________________________________________')
|
||
|
||
def search_id():
    """Sync scraped products (./tovar.json) into the Google Sheet.

    For every record: look up its code in column AF; when found, write '+'
    (price > 0) or '-' (price == 0) to column P and the price to column Z of
    the matching row.  Records absent from the sheet are accumulated and
    dumped to ./all_notfiles.xlsx.

    NOTE(review): assumes tovar.json holds a list of dicts with keys
    code/name/price/foto/op as produced by main1() — verify.
    """
    js = []  # records that were not found in the sheet
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
        for r in rr:
            codes = r['code']
            names = r['name']
            price = float(r['price'])
            foto = r['foto']
            op = r['op']
            ange_name = 'Export Products Sheet!H1:H400'  # unused leftover
            spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # target sheet id
            range_ = 'Export Products Sheet!AF1:AF15000'  # column holding product codes
            date_time_render_option = ''  # unused leftover
            # NOTE(review): the whole AF column is re-fetched for every record;
            # hoisting this out of the loop would save one API call per product.
            request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
            response = request.execute()
            a = (response['values'])  # list of single-cell rows, e.g. [['ALL_123'], ...]
            search_ids = [f'{codes}']  # wrapped in a list to match the row shape
            time.sleep(1)  # throttle the Sheets API
            if search_ids in a:
                for i, j in enumerate(a):
                    if j == search_ids:
                        stroka = i+1  # 1-based sheet row; the range starts at row 1
                        print(f'[=>] Нашёл в таблице! Код товара: {codes} В стороке: {stroka} Цена: {price}')
                        if price > 0.00:
                            # In stock: '+' in column P, price in column Z.
                            tovar = [['+']]
                            price1 = [[f'{price}']]
                            resp = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!P{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values' : tovar }).execute()
                            resp2 = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!Z{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values' : price1 }).execute()
                        elif price == 0.00:
                            # Out of stock: '-' in column P, zero price in column Z.
                            tovar = [['-']]
                            price1 = [[f'{price}']]
                            resp = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!P{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values' : tovar }).execute()
                            resp2 = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!Z{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values' : price1 }).execute()
                        else:
                            # Unreachable for non-negative prices; kept as a guard.
                            print('not')
            else:
                # Not in the sheet: collect and re-dump the whole list to Excel.
                js1 = {'code': codes,
                       'name': names,
                       'price': price,
                       'foto': foto,
                       'op': op}
                js.append(js1)
                DF = pd.DataFrame.from_dict(js)
                DF.to_excel( './all_notfiles.xlsx', sheet_name='no product', index=False)
|
||
# print(f'[=>] Записал в файл {codes}')
|
||
|
||
|
||
def find_elements_starting_with(a, prefix):
    """Return the elements of *a* whose string value starts with *prefix*,
    preserving their original order."""
    matched = []
    for element in a:
        if element.startswith(prefix):
            matched.append(element)
    return matched
|
||
|
||
def finds_ids():
    """Mark sheet rows whose 'ALL_'-prefixed code was NOT seen in the latest
    scrape: such products have disappeared from the site, so column P of the
    matching row is set to '-'.
    """
    lis = []  # codes present in the current scrape
    valuesss = [[int(0)]]  # unused leftover (commented-out price reset below)
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
        for r in rr:
            codes1 = r['code']
            # NOTE(review): records written by main1() already carry the
            # 'ALL_' prefix, so this may produce 'ALL_ALL_...' — verify
            # against the actual tovar.json contents.
            codes = f'ALL_{codes1}'
            lis.append(codes)
    ange_name = 'Export Products Sheet!H1:H400'  # unused leftover
    spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # target sheet id
    range_ = 'Export Products Sheet!AF2:AF15000'  # code column, skipping the header row
    date_time_render_option = ''  # unused leftover
    request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
    response = request.execute()
    a = (response['values'])  # list of single-cell rows, e.g. [['ALL_123'], ...]
    time.sleep(2)  # throttle the Sheets API
    # Flatten the nested list-of-rows repr into plain code strings.
    st = str(a).replace('[', '')
    st1 = st.replace(']', '')
    st2 = st1.replace("'", '')
    li = list(st2.split(", "))
    prefix = 'ALL'
    resultgoog = find_elements_starting_with(li, prefix)  # codes present in the sheet
    result = []  # sheet codes missing from the scrape
    for iii in resultgoog:
        if iii not in lis:
            result.append(iii)
            search_ids = [f'{iii}']  # wrapped in a list to match the row shape
            time.sleep(2)  # throttle the Sheets API
            if search_ids in a:
                for i, j in enumerate(a):
                    if j == search_ids:
                        stroka = i+2  # +2: 1-based rows and the range starts at row 2
                        print(f'[=>] Нашёл в таблице! Код товара: {search_ids} В стороке: {stroka}! Этого товара не было при парсинге!')
                        # Mark the product as unavailable in column P.
                        tovar = [['-']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values' : tovar }).execute()
|
||
|
||
|
||
if __name__ == '__main__':
    # Pipeline: truncate work files -> collect category URLs -> expand
    # pagination -> collect product URLs -> open the JSON array.
    files_tovar()
    catalog()
    prov_pagen()
    urls_tovar()
    files_jso()
    print('[*] Проверяю что осталось в файле')
    time.sleep(10)
    check_finish1 = True
    while check_finish1:
        # Count remaining product URLs (one per line).  Uses str.count and a
        # closed handle instead of the original regex over a leaked open().
        with open('./urltovar.txt') as fh:
            a = fh.read().count('\n')
        print(a)
        if a > 1:
            try:
                # asyncio.run replaces the deprecated
                # get_event_loop()/run_until_complete pattern.
                asyncio.run(main1())
                check_finish1 = False
            except Exception:
                # Network hiccup: back off and retry the remaining URLs.
                print('[*]Новое подключние')
                time.sleep(10)
        else:
            break
    files_js()  # close the JSON array
    try:
        search_id()
    except Exception as e:
        print(e)
        print('[*] Не залил весь товар.')
    try:
        finds_ids()
    except Exception as e:
        print(e)
        print('[*] Не проверил весь товар которого не было на сайте.')
|
||
|