first commit

commit 2af75cd9c6

@@ -0,0 +1,26 @@

# Ignore .log, .tmp and .html files
*.log
*.tmp
*.html
# Ignore the directory named temp
temp/

# Ignore the file config.ini
config.ini

# Ignore all files in the temp_folder directory
temp_folder/*

# Ignore all files and subdirectories in the logs directory
logs/*
.venv/
test.py
prov.py
all_notfiles.xlsx
creds_service_acc.json
*.zip
*.db
*.txt
*.json
# Do not ignore requirements.txt
!/requirements.txt

@@ -0,0 +1,39 @@

# Parser for https://allsuvenir.com.ua/ru/

* 👉 Parse the catalog
* 👉 Then collect every product from the catalog pages
* 👉 Then write the products to a JSON file
* 👉 Then upload them to a Google Sheet

* 👉 Products that are missing from the sheet are written to a separate page
* 👉 Products that were not found during this parse are marked with "-" in the sheet (run order is sketched below)
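
A rough sketch of the run order, assuming the parser module is named on.py (the name used by the PyInstaller command and the launcher script added in this commit). Importing it runs its module-level Google credentials setup, so creds_service_acc.json must be present. The sheet steps search_id() / finds_ids() exist in the code but are commented out in its __main__ block.

```
# sketch.py (illustrative only, not part of this commit)
import asyncio
import on

on.files_tovar()   # reset the working text files
on.catalog()       # catalog URLs       -> catalog.txt
on.prov_pagen()    # resolve pagination -> catalogall.txt
on.urls_tovar()    # product URLs       -> urltovar.txt
on.files_jso()     # open tovar.json with "["
asyncio.run(on.main1())   # scrape every product page -> tovar.json
on.files_js()      # close tovar.json with "]"
on.search_id()     # write "+"/"-" and prices into the Google Sheet
on.finds_ids()     # mark products absent from this parse with "-"
```
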
Create a virtual environment:

```
python -m venv venv
```

Activate it (PowerShell):

```
C:/python/NAME/venv/Scripts/Activate.ps1
```

Install the dependencies:

```
pip install -r requirements.txt
```

Build a standalone executable with PyInstaller:

```
pip install pyinstaller
```

```
pyinstaller --onefile --console on.py
```

or with auto-py-to-exe:

```
pip install auto-py-to-exe
```

```
auto-py-to-exe
```


> Don't forget!
> creds_service_acc.json is different for everyone! It is needed for the Google Sheet!
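
A quick way to check that creds_service_acc.json works before a full run (a hypothetical helper, not part of this commit; the spreadsheet ID below is the one hard-coded in the parser):

```
# check_creds.py (hypothetical helper, not part of this commit)
# Verifies that the service account can open the spreadsheet used by the parser.
from google.oauth2 import service_account
from googleapiclient.discovery import build

SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']
creds = service_account.Credentials.from_service_account_file(
    'creds_service_acc.json', scopes=SCOPES)
sheets = build('sheets', 'v4', credentials=creds)

# Spreadsheet ID hard-coded in the parser; the service-account e-mail
# must have been granted access to this sheet.
SPREADSHEET_ID = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'
meta = sheets.spreadsheets().get(spreadsheetId=SPREADSHEET_ID).execute()
print('OK:', meta['properties']['title'])
```
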
@@ -0,0 +1,11 @@

import subprocess

def start():
    # Replace 'on.py' with the name of the file you want to run
    file_to_run = 'on.py'

    # Run the file
    subprocess.run(['python', file_to_run])


if __name__ == '__main__':
    start()

@@ -0,0 +1,387 @@

import requests
from bs4 import BeautifulSoup
import re
import json
import asyncio
import aiohttp
import time
from googleapiclient.discovery import build
from google.oauth2 import service_account
import pandas as pd
import openpyxl  # used by pandas as the .xlsx engine


SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]


credentials = service_account.Credentials.from_service_account_file('creds_service_acc.json', scopes=SCOPES)

# spreadsheet_service = build('sheets', 'v4', credentials=credentials)
# drive_service = build('drive', 'v3', credentials=credentials)

# Build the Sheets client; fall back to an explicit discovery URL if the default fails
try:
    spreadsheet_service = build('sheets', 'v4', credentials=credentials)
except Exception:
    DISCOVERY_SERVICE_URL = 'https://sheets.googleapis.com/$discovery/rest?version=v4'
    spreadsheet_service = build('sheets', 'v4', credentials=credentials, discoveryServiceUrl=DISCOVERY_SERVICE_URL)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}


def files_jso():
    # Start tovar.json with an opening bracket
    with open("./tovar.json", "w", encoding='utf8') as file:
        file.write('[')


def files_js():
    # Strip the trailing ",\n" and close the JSON array
    with open("./tovar.json", "r", encoding='utf8') as file:
        x = file.read()

    with open("./tovar.json", "w", encoding='utf8') as file1:
        file1.write(x[:-2])

    with open("./tovar.json", "a", encoding='utf8') as file3:
        file3.write(']')


def files_tovar():
    # Reset the working text files
    with open("./urltovar.txt", "w", encoding='utf8') as file:
        file.write('')
    with open("./catalogall.txt", "w", encoding='utf8') as file:
        file.write('')


def prov_pagen():
    # For every catalog URL, look for the items-per-page selector and append
    # the URL (with ?n=<page-size option> when pagination exists) to catalogall.txt
    file1 = open('./catalog.txt', "r")
    while True:  # walk the file line by line
        line = file1.readline()
        if not line:
            break
        url = line.strip()
        s = requests.Session()
        r = s.get(url, headers=headers).text
        soup = BeautifulSoup(r, 'html.parser')
        pagen = soup.find_all('select', {"id": 'nb_page_items'})
        p = str(pagen)
        u = ' '.join(re.findall(r'>([^<>]+)</option> </select>', str(p)))
        # print(u)
        if len(u) == 0:
            print(url)
            with open("./catalogall.txt", "a") as file:
                file.write(url + '\n')
        else:
            print(f'{url}?n={u}')
            with open("./catalogall.txt", "a") as file:
                file.write(f'{url}?n={u}' + '\n')


def catalog():
    # Collect the catalog (category) links from the home page into catalog.txt
    url = 'https://allsuvenir.com.ua/ru/'
    s = requests.Session()
    res = s.get(url, headers=headers).text

    soup = BeautifulSoup(res, 'html.parser')
    items = soup.find_all('div', class_='list')
    # print(items)
    url = ' '.join(re.findall(r'<a href="([^<>]+)" title="">', str(items)))
    line = url.replace(' ', '\n')
    # catalog = re.findall(r'"\s*([^"]*?)\s*"', line)  # json.loads(line)
    # print(len(catalog))
    with open("./catalog.txt", "w") as file:
        file.write(line)


def urls_tovar():
    # For every catalog page, collect the product links into urltovar.txt
    file1 = open('./catalogall.txt', "r")
    while True:  # walk the file line by line
        line = file1.readline()
        if not line:
            break
        url = line.strip()
        # url = url1+'&n=135'
        print(url)
        # url = 'https://allsuvenir.com.ua/ru/114-zazhigalki?id_category=114&n=135'
        s = requests.Session()
        r = s.get(url, headers=headers).text

        soup = BeautifulSoup(r, 'html.parser')
        items = soup.find_all('ul', class_='product_list grid row')
        # print(items)

        url = ' '.join(re.findall(r'"product-name" href="([^<>]+)" itemprop="url"', str(items)))
        line = url.replace(' ', '\n')
        # tovar = re.findall(r'"\s*([^"]*?)\s*"', line)  # json.loads(line)
        print(line)
        with open("./urltovar.txt", "a") as file:
            file.write(line + '\n')


async def main1():
    # Fetch every product page listed in urltovar.txt, parse name / code /
    # description / price / photos, append the record to tovar.json and
    # remove the processed URL from urltovar.txt
    async with aiohttp.ClientSession() as session:

        file1 = open('./urltovar.txt', "r")
        while True:  # walk the file line by line
            line = file1.readline()
            if not line:
                break
            url = line.strip()
            # url = 'https://allsuvenir.com.ua/ru/glavnaya/1344-mk14.html?search_query=MK14&results=2'

            # print(url)
            async with session.get(url, headers=headers) as response:
                # await asyncio.sleep(0.2)

                r = await response.read()
                soup = BeautifulSoup(r, 'html.parser')
                print(f'===========> {url}')
                try:
                    table = soup.find_all('div', {"id": 'center_column'})
                    for item in table:
                        json_list = []
                        # js = []
                        try:
                            name = item.find('div', class_='pb-center-column col-xs-12 col-sm-4').find('h1').get_text(strip=True)
                            print(name)
                            code = item.find('p', {"id": 'product_reference'}).get_text(strip=True)
                            codes = code.replace('Артикул', '')

                            try:
                                ops = item.find('div', {"id": 'short_description_content'}).get_text()
                                print(ops)
                            except Exception:
                                ops = ""

                            prices = item.find('span', {"id": 'our_price_display'}).text
                            price = prices[:prices.find('$')]
                            fotos = item.find('ul', {"id": 'thumbs_list_frame'})
                            foto1 = ' '.join(re.findall(r'"other-views" href="([^<>]+)" title="', str(fotos)))
                            foto = foto1.replace(' ', ', ')

                            js1 = {'code': f'ALL_{codes}',
                                   'name': name,
                                   'price': price,
                                   'foto': foto,
                                   'op': ops}

                            with open("./tovar.json", "a", encoding='utf8') as file:
                                json.dump(js1, file, ensure_ascii=False)
                                file.write(',' + '\n')

                            # Drop the processed URL from urltovar.txt
                            with open("./urltovar.txt", "r", encoding='utf8') as f:
                                lines = f.readlines()

                            pattern = re.compile(re.escape(url))
                            with open("./urltovar.txt", "w", encoding='utf8') as f:
                                for line in lines:
                                    result = pattern.search(line)
                                    if result is None:
                                        f.write(line)

                            print(codes)
                        except Exception as a:
                            print(a)
                except Exception as a:
                    print(a)
                    time.sleep(100)
                    print('____________________________________________________________')
                    pass


def search_id():
    # For every product scraped into tovar.json, look up its code in column AF
    # of the Google Sheet; if found, write "+" (or "-" for a zero price) into
    # column P and the price into column Z, otherwise collect the product
    # into all_notfiles.xlsx
    js = []
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
        for r in rr:
            codes = r['code']
            names = r['name']
            price = float(r['price'])
            foto = r['foto']
            op = r['op']
            # print(op)

            ange_name = 'Export Products Sheet!H1:H400'
            spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # spreadsheet ID
            range_ = 'Export Products Sheet!AF1:AF15000'
            # value_render_option = '40'
            date_time_render_option = ''
            request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)  # , valueRenderOption=value_render_option)  # , dateTimeRenderOption=date_time_render_option)
            response = request.execute()

            # print(type(response))
            a = response['values']
            search_ids = [f'{codes}']
            # print(search_ids)
            time.sleep(1)
            if search_ids in a:

                # b = (a.index(search_ids))
                # print('go search')

                for i, j in enumerate(a):
                    if j == search_ids:
                        # print(i+1)
                        stroka = i + 1
                        print(f'[=>] Found in the sheet! Product code: {codes}, row: {stroka}, price: {price}')
                        if price > 0.00:
                            tovar = [['+']]
                            price1 = [[f'{price}']]
                            resp = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!P{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': tovar}).execute()
                            resp2 = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!Z{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': price1}).execute()

                        elif price == 0.00:
                            tovar = [['-']]
                            price1 = [[f'{price}']]
                            resp = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!P{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': tovar}).execute()
                            resp2 = spreadsheet_service.spreadsheets().values().update(
                                spreadsheetId=spreadsheet_id,
                                range="Export Products Sheet!Z{}".format(stroka),
                                valueInputOption="RAW",
                                body={'values': price1}).execute()
                            # print('-------!!!!!!!')
                        else:
                            print('not')

            else:
                # Product code is not in the sheet: remember it for the report file
                js1 = {'code': codes,
                       'name': names,
                       'price': price,
                       'foto': foto,
                       'op': op}
                js.append(js1)
                # print(js)
                DF = pd.DataFrame.from_dict(js)
                DF.to_excel('./all_notfiles.xlsx', sheet_name='no product', index=False)
                # print(f'[=>] Wrote {codes} to the file')


def find_elements_starting_with(a, prefix):
    return [s for s in a if s.startswith(prefix)]


def finds_ids():
    # Compare the codes scraped in this run (tovar.json) against the "ALL"
    # codes already present in column AF of the sheet; sheet rows whose code
    # was NOT seen in this parse get "-" written into column P
    # with open("./idsexes.txt", "w", encoding='utf8') as file:
    #     file.write('')

    lis = []
    valuesss = [[int(0)]]
    with open("./tovar.json", "r", encoding='utf8') as file:
        rr = json.load(file)
        for r in rr:
            # tovar.json codes are already stored as 'ALL_<reference>' by main1(),
            # so they are compared against the sheet values directly
            codes = r['code']
            lis.append(codes)
    # print(lis)
    ange_name = 'Export Products Sheet!H1:H400'
    spreadsheet_id = '1KQUjDeeOE8fIC1DlA7N8VL32Ee4Wh0CgC28p3cggsuc'  # '1c5gAmgzLqbNfAC6a4DL7F4KDPXI5xn3joD0Z8Kxr-jY'  # spreadsheet ID
    range_ = 'Export Products Sheet!AF2:AF15000'
    # value_render_option = '40'
    date_time_render_option = ''
    request = spreadsheet_service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_)  # , valueRenderOption=value_render_option)  # , dateTimeRenderOption=date_time_render_option)
    response = request.execute()

    # print(type(response))
    a = response['values']
    time.sleep(2)
    st = str(a).replace('[', '')
    st1 = st.replace(']', '')
    st2 = st1.replace("'", '')

    li = list(st2.split(", "))
    prefix = 'ALL'
    resultgoog = find_elements_starting_with(li, prefix)
    result = []
    # print(a)
    for iii in resultgoog:
        if iii not in lis:
            result.append(iii)
            search_ids = [f'{iii}']
            time.sleep(2)
            if search_ids in a:

                # b = (a.index(search_ids))
                # print('go search')

                for i, j in enumerate(a):
                    # time.sleep(1)
                    if j == search_ids:
                        # print(i+1)
                        stroka = i + 2
                        print(f'[=>] Found in the sheet! Product code: {search_ids}, row: {stroka}! This product was not seen during parsing!')
                        tovar = [['-']]
                        resp = spreadsheet_service.spreadsheets().values().update(
                            spreadsheetId=spreadsheet_id,
                            range="Export Products Sheet!P{}".format(stroka),
                            valueInputOption="RAW",
                            body={'values': tovar}).execute()

                        # resp = spreadsheet_service.spreadsheets().values().update(
                        #     spreadsheetId=spreadsheet_id,
                        #     range="Export Products Sheet!Z{}".format(stroka),
                        #     valueInputOption="RAW",
                        #     body={'values': valuesss}).execute()


if __name__ == '__main__':
    files_tovar()
    catalog()
    prov_pagen()
    urls_tovar()
    files_jso()
    print('[*] Checking what is left in the file')
    time.sleep(10)
    check_finish1 = True
    while check_finish1:
        # Count how many URL lines are still left in urltovar.txt
        a = len(re.findall(r"[\n]+?", open('./urltovar.txt').read()))
        print(a)
        if a > 1:
            try:
                loop = asyncio.get_event_loop()
                loop.run_until_complete(main1())
                check_finish1 = False
            except Exception:
                print('[*] New connection')
                time.sleep(10)
        else:
            break
    # loop = asyncio.get_event_loop()
    # loop.run_until_complete(main1())
    files_js()
    # try:
    #     search_id()
    # except Exception as e:
    #     print(e)
    #     print('[*] Did not upload all the products.')
    # #
    # try:
    #     finds_ids()
    # except Exception as e:
    #     print(e)
    #     print('[*] Did not check all the products that were missing from the site.')

@@ -0,0 +1,37 @@

aiohttp==3.8.3
aiosignal==1.3.1
async-timeout==4.0.2
attrs==22.2.0
beautifulsoup4==4.11.1
cachetools==5.3.0
certifi==2022.12.7
charset-normalizer==2.1.1
et-xmlfile==1.1.0
frozenlist==1.3.3
google-api-core==2.11.0
google-api-python-client==2.74.0
google-auth==2.16.0
google-auth-httplib2==0.1.0
google-auth-oauthlib==0.8.0
googleapis-common-protos==1.58.0
httplib2==0.21.0
idna==3.4
multidict==6.0.4
numpy==1.24.1
oauthlib==3.2.2
openpyxl==3.0.10
pandas==1.5.3
protobuf==4.21.12
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2022.7.1
requests==2.28.2
requests-oauthlib==1.3.1
rsa==4.9
six==1.16.0
soupsieve==2.3.2.post1
uritemplate==4.1.1
urllib3==1.26.14
yarl==1.8.2