# allsuvenir/on.py
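"""Scraper for allsuvenir.com.ua.

Pipeline: collect category URLs, expand pagination, gather product URLs,
parse every product page into tovar.json, then push availability and prices
into the 'Export Products Sheet' tab of a Google Sheet via a service account.

Each record appended to tovar.json has the shape:
    {"code": "ALL_<sku>", "name": ..., "price": ..., "foto": ..., "op": ...}
"""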
import requests
from bs4 import BeautifulSoup
import re
import json
import asyncio
import aiohttp
import time
from googleapiclient.discovery import build
from google.oauth2 import service_account
import pandas as pd
import openpyxl  # Excel writer engine used by pandas.DataFrame.to_excel
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]
credentials = service_account.Credentials.from_service_account_file('creds_service_acc.json', scopes=SCOPES)
# Build the Sheets API client; fall back to an explicit discovery URL if the
# default discovery document cannot be fetched.
try:
    spreadsheet_service = build('sheets', 'v4', credentials=credentials)
except Exception:
    DISCOVERY_SERVICE_URL = 'https://sheets.googleapis.com/$discovery/rest?version=v4'
    spreadsheet_service = build('sheets', 'v4', credentials=credentials, discoveryServiceUrl=DISCOVERY_SERVICE_URL)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
def files_jso():
    # Start tovar.json with an opening bracket; items are appended one by one.
    with open("./tovar.json", "w", encoding='utf8') as file:
        file.write('[')
def files_js():
    # Close tovar.json: strip the trailing ",\n" after the last item and append "]".
    with open("./tovar.json", "r", encoding='utf8') as file:
        x = file.read()
    with open("./tovar.json", "w", encoding='utf8') as file1:
        file1.write(x[:-2])
    with open("./tovar.json", "a", encoding='utf8') as file3:
        file3.write(']')
def files_tovar():
    # Truncate the work files before a new run.
    with open("./urltovar.txt", "w", encoding='utf8') as file:
        file.write('')
    with open("./catalogall.txt", "w", encoding='utf8') as file:
        file.write('')
def prov_pagen():
    # For every category URL, read the "items per page" selector and write a URL
    # that shows all items on one page into catalogall.txt.
    with open('./catalog.txt', "r") as file1:
        for line in file1:
            url = line.strip()
            s = requests.Session()
            r = s.get(url, headers=headers).text
            soup = BeautifulSoup(r, 'html.parser')
            pagen = soup.find_all('select', {"id": 'nb_page_items'})
            p = str(pagen)
            u = ' '.join(re.findall(r'>([^<>]+)</option> </select>', str(p)))
            if len(u) == 0:
                print(url)
                with open("./catalogall.txt", "a") as file:
                    file.write(url + '\n')
            else:
                print(f'{url}?n={u}')
                with open("./catalogall.txt", "a") as file:
                    file.write(f'{url}?n={u}' + '\n')
def catalog():
    # Collect category URLs from the main page into catalog.txt.
    url = 'https://allsuvenir.com.ua/ru/'
    s = requests.Session()
    res = s.get(url, headers=headers).text
    soup = BeautifulSoup(res, 'html.parser')
    items = soup.find_all('div', class_='list')
    url = ' '.join(re.findall(r'<a href="([^<>]+)" title="">', str(items)))
    line = url.replace(' ', '\n')
    with open("./catalog.txt", "w") as file:
        file.write(line)
def urls_tovar():
    # Collect product URLs from every category page into urltovar.txt.
    with open('./catalogall.txt', "r") as file1:
        for line in file1:
            url = line.strip()
            print(url)
            s = requests.Session()
            r = s.get(url, headers=headers).text
            soup = BeautifulSoup(r, 'html.parser')
            items = soup.find_all('ul', class_='product_list grid row')
            url = ' '.join(re.findall(r'"product-name" href="([^<>]+)" itemprop="url"', str(items)))
            line = url.replace(' ', '\n')
            print(line)
            with open("./urltovar.txt", "a") as file:
                file.write(line + '\n')
async def main1():
    # Fetch every URL left in urltovar.txt, parse the product page and append the
    # item to tovar.json; each successfully parsed URL is removed from urltovar.txt
    # so an interrupted run can be resumed.
    async with aiohttp.ClientSession() as session:
        with open('./urltovar.txt', "r") as file1:
            urls = [line.strip() for line in file1 if line.strip()]
        for url in urls:
            async with session.get(url, headers=headers) as response:
                r = await response.read()  # raw HTML of the product page
                soup = BeautifulSoup(r, 'html.parser')
                print(f'===========> {url}')
                try:
                    table = soup.find_all('div', {"id": 'center_column'})
                    for item in table:
                        try:
                            name = item.find('div', class_='pb-center-column col-xs-12 col-sm-4').find('h1').get_text(strip=True)
                            print(name)
                            code = item.find('p', {"id": 'product_reference'}).get_text(strip=True)
                            codes = code.replace('Артикул', '')  # strip the "Артикул" (SKU) label
                            try:
                                ops = item.find('div', {"id": 'short_description_content'}).get_text()
                                print(ops)
                            except Exception:
                                ops = ""
                            prices = item.find('span', {"id": 'our_price_display'}).text
                            price = prices[:prices.find('$')]
                            fotos = item.find('ul', {"id": 'thumbs_list_frame'})
                            foto1 = ' '.join(re.findall(r'"other-views" href="([^<>]+)" title="', str(fotos)))
                            foto = foto1.replace(' ', ', ')
                            js1 = {'code': f'ALL_{codes}',
                                   'name': name,
                                   'price': price,
                                   'foto': foto,
                                   'op': ops}
                            with open("./tovar.json", "a", encoding='utf8') as file:
                                json.dump(js1, file, ensure_ascii=False)
                                file.write(',' + '\n')
                            # Drop the processed URL from urltovar.txt.
                            with open("./urltovar.txt", "r", encoding='utf8') as f:
                                lines = f.readlines()
                            pattern = re.compile(re.escape(url))
                            with open("./urltovar.txt", "w", encoding='utf8') as f:
                                for line in lines:
                                    if pattern.search(line) is None:
                                        f.write(line)
                            print(codes)
                        except Exception as a:
                            print(a)
                except Exception as a:
                    print(a)
                    await asyncio.sleep(100)
                print('____________________________________________________________')
def search_id():
    # For every parsed item, look up its code in column AF of the sheet and update
    # availability (column P) and price (column Z). Items not found in the sheet
    # are written to all_notfiles.xlsx.
    js = []
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
    for r in rr:
        codes = r['code']
        names = r['name']
        price = float(r['price'])
        foto = r['foto']
        op = r['op']
        spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # spreadsheet ID
        range_ = 'Export Products Sheet!AF1:AF15000'
        request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
        response = request.execute()
        a = response['values']
        search_ids = [f'{codes}']
        time.sleep(1)
        if search_ids in a:
            for i, j in enumerate(a):
                if j == search_ids:
                    stroka = i + 1
                    print(f'[=>] Found in the sheet! Product code: {codes} Row: {stroka} Price: {price}')
                    if price > 0.00:
                        tovar = [['+']]
                        price1 = [[f'{price}']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()
                        resp2 = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!Z{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': price1}).execute()
                    elif price == 0.00:
                        tovar = [['-']]
                        price1 = [[f'{price}']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()
                        resp2 = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!Z{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': price1}).execute()
                    else:
                        print('not')
        else:
            # Product code not present in the sheet: keep it for the Excel report.
            js1 = {'code': codes,
                   'name': names,
                   'price': price,
                   'foto': foto,
                   'op': op}
            js.append(js1)
            DF = pd.DataFrame.from_dict(js)
            DF.to_excel('./all_notfiles.xlsx', sheet_name='no product', index=False)
def find_elements_starting_with(a, prefix):
    return [s for s in a if s.startswith(prefix)]
def finds_ids():
    # Find 'ALL_' codes that are present in the sheet but were not parsed in this
    # run (the product disappeared from the site) and mark them '-' in column P.
    lis = []
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
    for r in rr:
        # Codes in tovar.json are already stored with the 'ALL_' prefix.
        lis.append(r['code'])
    spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # spreadsheet ID
    range_ = 'Export Products Sheet!AF2:AF15000'
    request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)
    response = request.execute()
    a = response['values']
    time.sleep(2)
    # Flatten the list of single-cell rows into a plain list of code strings.
    st = str(a).replace('[', '')
    st1 = st.replace(']', '')
    st2 = st1.replace("'", '')
    li = list(st2.split(", "))
    prefix = 'ALL'
    resultgoog = find_elements_starting_with(li, prefix)
    result = []
    for iii in resultgoog:
        if iii not in lis:
            result.append(iii)
            search_ids = [f'{iii}']
            time.sleep(2)
            if search_ids in a:
                for i, j in enumerate(a):
                    if j == search_ids:
                        stroka = i + 2
                        print(f'[=>] Found in the sheet! Product code: {search_ids} Row: {stroka}! This product was not seen during parsing!')
                        tovar = [['-']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()
if __name__ == '__main__':
    files_tovar()
    catalog()
    prov_pagen()
    urls_tovar()
    files_jso()
    print('[*] Checking what is left in the file')
    time.sleep(10)
    check_finish1 = True
    while check_finish1:
        # Count the URLs still waiting in urltovar.txt (one per line) and retry
        # main1() until it completes without an exception.
        a = len(re.findall(r"[\n]+?", open('./urltovar.txt').read()))
        print(a)
        if a > 1:
            try:
                loop = asyncio.get_event_loop()
                loop.run_until_complete(main1())
                check_finish1 = False
            except Exception:
                print('[*] Reconnecting')
                time.sleep(10)
        else:
            break
    files_js()
    try:
        search_id()
    except Exception as e:
        print(e)
        print('[*] Did not upload all products.')
    try:
        finds_ids()
    except Exception as e:
        print(e)
        print('[*] Did not check all products that were missing from the site.')