【AI CAMP Cohort 3】 Group 2 Assignments

Summary thread for Group 2 assignments, AI CAMP Cohort 3

Week 1 Assignment
  • For the ImageNet data (s3://ai-cultivate/1percent_ImageNet.txt), compute basic statistics (the average, maximum, and minimum width/height of the boxes/images), preprocess the data (rotation augmentation and random crop), and visualize the images before and after processing to compare the changes.
  • For external students, the image source is not restricted: use any 100 images from any source, compute the same basic statistics (average, maximum, and minimum width/height of the boxes/images), do the same preprocessing (rotation augmentation and random crop), and visualize the images before and after processing to compare the changes (a minimal sketch for this variant follows this list).
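
For the external-student variant, one possible minimal sketch is shown below; it assumes OpenCV and imgaug are installed, and the ./images folder and *.jpg pattern are placeholders for whichever 100 images are used:

import glob

import cv2
import numpy as np
from imgaug import augmenters as iaa

# Collect up to 100 images from a local folder ("./images" is only a placeholder path).
paths = sorted(glob.glob("./images/*.jpg"))[:100]
imgs = [im for im in (cv2.imread(p) for p in paths) if im is not None]

# Basic width/height statistics.
shapes = np.array([im.shape[:2] for im in imgs])  # one (height, width) row per image
print("height avg/max/min:", shapes[:, 0].mean(), shapes[:, 0].max(), shapes[:, 0].min())
print("width  avg/max/min:", shapes[:, 1].mean(), shapes[:, 1].max(), shapes[:, 1].min())

# Rotation augmentation + random crop; keep_size=True resizes the crop back to the
# original size so it can be concatenated with the source image for comparison.
seq = iaa.Sequential([iaa.Affine(rotate=(-30, 30)), iaa.Crop(px=(0, 16), keep_size=True)])
for i, im in enumerate(imgs[:5]):
    aug = seq(image=im)
    cv2.imwrite("compare_{}.jpg".format(i), np.concatenate([im, aug], axis=1))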
Week 1 Assignment - 余立春
from refile import smart_open
import nori2 as nori
from meghair.utils.imgproc import imdecode
import cv2
import numpy as np
from imgaug import augmenters as iaa

'''
    Collect image width/height statistics.
    Apply rotation augmentation and random crop to the images.
'''

if __name__ == "__main__":
    fpath = "s3://ai-cultivate/1percent_ImageNet.txt"
    fetcher = nori.Fetcher()
    shapes = []
    i = 0
    with smart_open(fpath) as f:
        for line in f:
            nori_id = line.split("\t")[0]
            img_bytes = fetcher.get(nori_id)
            img_npy = imdecode(img_bytes)

            # print(img_npy)
            print(img_npy.shape)  # like: (515, 600, 3)
            # The shape attribute gives the image size as a tuple:
            # (number of rows, number of columns, 3), where 3 means each pixel consists of the three colour channels.
            shapes.append(img_npy.shape)
            i += 1
            if i == 100:  # only use the first 100 images for the statistics
                break

    shapes = np.array(shapes)
    print(shapes.shape)  # prints (100, 3)
    # axis=0 reduces over the rows, i.e. one value per column;
    # axis=1 reduces over the columns, i.e. one value per row.
    # [:2] keeps only the first two entries of each result (height, width) and drops the channel count.
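    # Worked example with illustrative values (not from this dataset):
    # np.array([[200, 300, 3], [400, 500, 3]]).mean(axis=0)[:2] -> array([300., 400.])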
    print("avg: ", shapes.mean(axis=0)[:2])
    print("max: ", shapes.max(axis=0)[:2])
    print("min: ", shapes.min(0)[:2])

    # Apply augmentation only to the last image read above
    seq = iaa.Sequential(
        [
            # Instantiating the affine transform with rotate=(-25, 25) draws the rotation
            # angle uniformly from that interval: rotate ~ uniform(-25, 25).
            # A constant (rotate=-25, always rotate by -25°) or a list (rotate=[-25, -15, 0],
            # rotate by -25°, -15° or 0°) could be used instead.
            iaa.Affine(rotate=(-25, 25)),
            # randomly crop 0-16 pixels from the edges
            iaa.Crop(px=(0, 16))
        ]
    )

    cv2.imwrite("work1_img.jpg", img_npy)
    cv2.imwrite("work1_img1.jpg", seq.augment_images([img_npy])[0])

    # img_augs = seq.augment_images([img_npy])
    # cv2.imshow("img", img_npy)
    # cv2.imshow("img_aug", img_augs[0])
    if cv2.waitKey(0) == 13:  # press Enter in the image window to exit (only relevant if the imshow calls above are uncommented)
        cv2.destroyAllWindows()

[Images: work1_img.jpg (original) and work1_img1.jpg (augmented)]

Week 1 Assignment - 朱胜

import cv2
import numpy as np
import matplotlib.pyplot as plt

import boto3
import nori2 as nori
from urllib.parse import urlparse

import imgaug as ia
import imgaug.augmenters as iaa

H,W = 320,240
LIMIT=20
NUM=6

fetcher = nori.Fetcher()
s3client = boto3.client('s3', endpoint_url="http://oss.i.brainpp.cn")

ia.seed(1)
seq = iaa.Sequential([
    iaa.Rotate((-45, 45)),
    iaa.Crop(percent=(0,0.1)),
    iaa.Resize({"height":H, "width":W})
], random_order=True)


def oss_get(path):
    url = urlparse(path)
    obj = s3client.get_object(Bucket=url.netloc, Key=url.path.lstrip("/"))
    return obj['Body'].read()

def nori_get_image(nori_id):
    buf = fetcher.get(nori_id)    
    img = cv2.imdecode(np.frombuffer(buf, dtype=np.uint8) , cv2.IMREAD_COLOR)
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

def show_img(imgs):
    canvas = np.zeros((H,(W+10)*len(imgs),3), dtype=np.uint8)
    for i, img in enumerate(imgs):
        canvas[:, i*(W+10):i*(W+10)+W,:] = img
    plt.figure(figsize=(3*len(imgs), 2))
    plt.imshow(canvas)
    plt.show()

object_url ='s3://ai-cultivate/1percent_ImageNet.txt'
lines = oss_get(object_url).decode("UTF-8").split('\n')
lines = [line.split() for line in lines if len(line) > 0]

# --- size ----
sizes = []
for i, line in enumerate(lines):
    if i % 1000 == 0:
        print(f"{i}/{len(lines)}")
    img = nori_get_image(line[0])
    sizes.append(img.shape)

sizes = np.array(sizes)
print("height mean:{} max:{} min:{}".format(np.mean(sizes[:,0]), np.max(sizes[:,0]), np.min(sizes[:,0])))
print("width mean:{} max:{} min:{}".format(np.mean(sizes[:,1]), np.max(sizes[:,1]), np.min(sizes[:,1])))

# --- show ---
for i, line in enumerate(lines[:LIMIT]):
    img = nori_get_image(line[0])
    images = np.array([img]*NUM, dtype=np.uint8)
    show_img(seq(images=images))

height mean:405.35805167434233 max:5065 min:21
width mean:471.59167902583715 max:4368 min:46

Week 1 Assignment - 张阳坤

# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
from IPython.display import Image
from imgaug import augmenters as iaa
import random
import numpy as np
import cv2 as cv
import nori2 as nori
import csv
from io import BytesIO
import boto3
from IPython import get_ipython

# %% [markdown]
# # Assignment description
# For the ImageNet data (s3://ai-cultivate/1percent_ImageNet.txt), compute basic statistics (average, maximum, and minimum width/height of the boxes/images), preprocess the data (rotation augmentation, random crop), and visualize the images before and after processing to compare the changes.

# %%
get_ipython().run_line_magic(
    'pip', 'install --user Augmentor imgaug boto3 nori2 opencv-python')


# %%
# Download image set data

host = "http://oss.i.brainpp.cn"
s3 = boto3.resource('s3', endpoint_url=host)
buf = BytesIO()
s3.Bucket("ai-cultivate").download_fileobj("1percent_ImageNet.txt", buf)

reader = csv.DictReader(buf.getvalue().decode('ascii').splitlines(), fieldnames=[
                        "nori_id", "?", "file_name"], delimiter="\t", lineterminator='\n')
images = [i for i in reader]


# %%
# Fetch images and collect size information


fetcher = nori.Fetcher()
shapes = np.empty((len(images), 2))
for i, image_info in enumerate(images):
    img_data = fetcher.get(image_info["nori_id"])
    img = cv.imdecode(np.frombuffer(img_data, np.uint8), cv.IMREAD_UNCHANGED)
    shapes[i] = img.shape[0:2]
    if i % 1000 == 0:
        print(i)


# %%
# Calculate max min average sizes
print("Max height is {0[0]}, max width is {0[1]}".format(np.amax(shapes, 0)))
print("Min height is {0[0]}, min width is {0[1]}".format(np.amin(shapes, 0)))
print("Avg height is {0[0]}, avg width is {0[1]}".format(
    np.average(shapes, 0)))


# %%
# Process an image

chosen_image = random.choice(images)
print(
    f"Choose image \"{chosen_image['file_name']}\", nori_id {chosen_image['nori_id']}")
img_data = nori.Fetcher().get(chosen_image["nori_id"])
img = cv.imdecode(np.frombuffer(img_data, np.uint8), cv.IMREAD_UNCHANGED)

NUM = 3  # transformation count
H, W, _ = img.shape

source_images = np.array([img]*NUM,  dtype=np.uint8)
seq = iaa.Sequential(
    children=[
        iaa.Rotate((-90, 90)),
        iaa.Crop((0, 20))
    ], random_order=True
)
augmented_images = seq(images=source_images)


# output_image = np.zeros((H,(NUM+1)*W,img.shape[2]), dtype=np.uint8)
# output_image[:, 0:W, :]=img
# for i, image in enumerate(augmented_images):
#     output_image[:,(i+1)*W:(i+2)*W,:]=image
output_image = np.concatenate((img, *augmented_images), axis=1)


# %%
_, buf = cv.imencode(".jpeg", output_image)
Image(data=buf)

Week 1 Assignment

from meghair.utils import io
from meghair.utils.imgproc import imdecode
import nori2 as nori
import cv2
import imgaug.augmenters as iaa
import numpy as np

sometimes = lambda aug: iaa.Sometimes(0.5, aug)  # apply the wrapped augmenter with probability 0.5
fetcher = nori.Fetcher()
f = open("../pics/1percent_ImageNet.txt")
i = 0
NUM = 4
seq = iaa.Sequential([
    iaa.Crop(px=(0,16)),
    sometimes(iaa.Affine(
    rotate=(-45,45),
    )),
], random_order=True)
w_max = 0
w_min = 10000
w_sum = 0
h_max = 0
h_min = 10000
h_sum = 0
while 1:
    line  = f.readline()
    if not line:
        break
    img = imdecode(fetcher.get(line.split()[0]))[...,:3]
    H = img.shape[0]
    W = img.shape[1]
    if H > h_max:
        h_max = H
    if H < h_min:
        h_min = H
    h_sum += H
    if W > w_max:
        w_max = W
    if W < w_min:
        w_min = W
    w_sum += W
    images = np.array(
        [img] * NUM,
        dtype=np.uint8,
    )
    write_img = np.zeros((H, (W+10)*NUM,3), dtype=np.uint8)
    images_aug = seq(images=images)
    for j, img in enumerate(images_aug):
        write_img[:, j*(W+10): j*(W+10)+W, :] = img
    cv2.imwrite("final_%d.jpg" % i, write_img)
    i += 1
f.close()
w_avg = w_sum/i
h_avg = h_sum/i
print("w_max", w_max, "w_min", w_min, "w_avg",w_avg)
print("h_max", h_max, "h_min", h_min, "h_avg", h_avg)
w_max 4368 w_min 46 w_avg 471.49918039185076
h_max 5065 h_min 21 h_avg 405.4505503083288




Week 1 Assignment - 骆明

Code

import cv2
import refile
import nori2 as nori
import numpy as np
import imgaug as ia
import imgaug.augmenters as iaa
from meghair.utils import io
from meghair.utils.imgproc import imdecode

s3_file_path = "s3://ai-cultivate/1percent_ImageNet.txt"

ia.seed(1)
H, W = 256, 128

seq = iaa.Sequential([
    iaa.Fliplr(0.5),  # horizontal flips
    iaa.Affine(rotate=(-30, 30)),  # rotate
    iaa.Crop(px=(0, 16)),  # random crops
    iaa.Resize({"height": H, "width": W})
], random_order=True)  # apply augmenters in random order


def read_imgs(path):
    fetcher = nori.Fetcher()
    img_list = []
    i = 0
    with refile.smart_open(path, encoding='utf-8') as f:
        for line in f:
            i += 1
            if i > 10:
                break
            nori_id, id, img_name = line.split()
            img = imdecode(fetcher.get(nori_id))[..., :3]
            img_info = {"id": nori_id, "img":img}
            img_list.append(img_info)
    return img_list


def statistics(img_list):
    img_height_list = []
    img_width_list = []
    for img_info in img_list:
        img_height_list.append(img_info["img"].shape[0])
        img_width_list.append(img_info["img"].shape[1])

    max_height = max(img_height_list)
    min_height = min(img_height_list)
    avg_height = np.mean(img_height_list)
    max_width = max(img_width_list)
    min_width = min(img_width_list)
    avg_width = np.mean(img_width_list)
    print("height:\n\tmax={}\tmin={}\tavg={}".format(max_height, min_height, avg_height))
    print("width:\n\tmax={}\tmin={}\tavg={}".format(max_width, min_width, avg_width))


def enhance_and_compare(img_info):
    img = img_info["img"]
    write_img = np.zeros((H, W * 2 + 10, 3), dtype=np.uint8)
    images = np.array([img], dtype=np.uint8)
    images_aug = seq(images=images)
    # write the original image first
    resized_img = cv2.resize(img, (W, H), interpolation=cv2.INTER_AREA)
    write_img[:, : W, :] = resized_img
    # then write the augmented image
    for i, img in enumerate(images_aug):
        write_img[:, (i + 1) * (W + 10): (i + 1) * (W + 10) + W, :] = img
    cv2.imwrite("1_homework/{}.jpg".format(img_info["id"]), write_img)


if __name__ == '__main__':
    img_list = read_imgs(s3_file_path)
    statistics(img_list)
    enhance_and_compare(img_list[0])
    enhance_and_compare(img_list[1])
    enhance_and_compare(img_list[2])

Output

height:
	max=515	min=176	avg=366.1
width:
	max=640	min=167	avg=433.8

Three images were selected for augmentation and compared with the originals:
[Images: 1261,1e9700036e2c61c  1261,95a000bc650e68  1261,126e0006dc3eed4]

Week 1 Assignment - 李乐亮

import cv2
import imgaug as ia
import imgaug.augmenters as iaa
import nori2 as nori
import numpy as np
import refile
from meghair.utils.imgproc import imdecode

fetcher = nori.Fetcher()

ia.seed(1)
H, W = 256, 128
NUM = 6
seq = iaa.Sequential([
    iaa.Crop(px=(0, 16)),  # random crop: crop 0-16 pixels from the image edges
    iaa.Affine(rotate=(-60, 60)),  # rotate
    iaa.Resize({"height": H, "width": W})
], random_order=True)

datas = refile.smart_open("s3://ai-cultivate/1percent_ImageNet.txt")
height_list = list()
width_list = list()
write_img = np.zeros((H, (W + 10) * (NUM + 1), 3), dtype=np.uint8)

for j, item in enumerate(datas):
    img = imdecode(fetcher.get(item.split()[0]))[..., :3]
    height, width, colors = img.shape
    # statistics
    height_list.append(height)
    width_list.append(width)

    # only augment the first NUM images
    if j >= NUM:
        continue

    images = np.array(
        [img] * NUM,
        dtype=np.uint8
    )

    imgHW = cv2.resize(img, (W, H))  # resize source image
    write_img[:, 0:W:] = imgHW  # add the source image for comparison

    images_aug = seq.augment_images(images=images)  # augment images
    for i, img_aug in enumerate(images_aug):
        write_img[:, (i + 1) * (W + 10):(i + 1) * (W + 10) + W:] = img_aug
    # save the comparison image
    cv2.imwrite("homework-image-{}.jpg".format(j), write_img)

print("count:{} [Height]max:{}, min:{}, average:{}; [Width]max:{}, min:{}, average:{}".format(
    len(height_list), max(height_list), min(height_list), sum(height_list) / len(height_list),
    max(width_list), min(width_list), sum(width_list) / len(width_list)
))




Is the MegStudio platform down?

Week 1 Assignment - 陈贻国

import cv2
import nori2 as nori
from meghair.utils.imgproc import imdecode
import imgaug as ia
import imgaug.augmenters as iaa
import refile
import numpy as np

fetcher = nori.Fetcher()

ia.seed(1)
H, W = 256, 128
NUM = 5

seq = iaa.Sequential([
    iaa.Fliplr(0.5),
    iaa.Crop(px=(0, 16)),
    iaa.Resize({"height": H, "width": W}),
    iaa.Affine(rotate=(-45, 45)),
], random_order=True)

shapes = []
imgs = []
datas = refile.smart_open("s3://ai-cultivate/1percent_ImageNet.txt")
for j, item in enumerate(datas):
    img = imdecode(fetcher.get(item.split()[0]))[..., :3]
    shapes.append(img.shape)
    if j % 1000 == 0:
        imgs.append(img)

shapes = np.array(shapes)
#print(shapes)
h_max = shapes.max(axis=0)[0]
h_mean = shapes.mean(axis=0)[0]
h_min = shapes.min(axis=0)[0]

w_max = shapes.max(axis=0)[1]
w_mean = shapes.mean(axis=0)[1]
w_min = shapes.min(axis=0)[1]

print("h_max: %f, h_min: %f, h_mean: %f \nw_max: %f, w_min: %f, w_mean: %f" %(h_max,h_min,h_mean,w_max,w_min,w_mean))

write_img = np.zeros((H, (W + 10) * (NUM + 1), 3), dtype=np.uint8)
for j, img in enumerate(imgs):
    img_array = np.array(
        [img] * NUM,
        dtype = np.uint8
    )
    img = cv2.resize(img, (W, H))
    write_img[:, 0:W:] = img
    img_augs = seq.augment_images(images=img_array)
    for i, img_aug in enumerate(img_augs):
        write_img[:, (i + 1) * (W + 10):(i + 1) * (W + 10) + W:] = img_aug
    cv2.imwrite("img"+str(j)+".jpg", write_img)

Output:
h_max: 5065.000000, h_min: 21.000000, h_mean: 405.450550
w_max: 4368.000000, w_min: 46.000000, w_mean: 471.499180
Images:


Week 1 Assignment - 孙聪

Week 1 assignment
import nori2 as nori
import cv2
import numpy as np

fetcher = nori.Fetcher()

def mkdir(path):
    import os
    path=path.strip()
    path=path.rstrip("/")
 
    isExists=os.path.exists(path)
    if not isExists:
        os.makedirs(path) 
        return True
    else:
        return False

def get_image(nori_id):
	buf = fetcher.get(nori_id)
	img = cv2.imdecode(np.frombuffer(buf, dtype=np.uint8) , cv2.IMREAD_COLOR)
	return img

def download_image():
	f = open('1percent_ImageNet.txt', 'r')
	data = f.read()
	lines = data.split('\n')
	for i in range(len(lines)):
		if i % 10 == 0:
			print(i)
		s = lines[i].split('\t')
		if len(s) == 3:
			mkdir('imgs/'+s[2].split('/')[0])
			img = get_image(s[0])
			cv2.imwrite('imgs/'+s[2], img)

def cat():
	shapes = []
	f = open('1percent_ImageNet.txt', 'r')
	data = f.read()
	lines = data.split('\n')
	for i in range(len(lines)):
		s = lines[i].split('\t')
		if len(s) == 3:
			img = get_image(s[0])
			shapes.append(img.shape)
	shapes = np.array(shapes)
	print("height: avg={} min={} max={}".format(np.average(shapes[:,0]), np.min(shapes[:,0]), np.max(shapes[:,0])))
	print("width: avg={} min={} max={}".format(np.average(shapes[:,1]), np.min(shapes[:,1]), np.max(shapes[:,1])))
#height: avg=405.35805167434233 min=21 max=5065
#width: avg=471.59167902583715 min=46 max=4368


def ppp():
	import imgaug as ia
	import imgaug.augmenters as iaa
	ia.seed(20210520)
	seq = iaa.Sequential([
		iaa.Rotate((-90, 90)),
		iaa.Crop(px=(0,16)),
		iaa.Resize({"height":256, "width":256})
	], random_order=True)
	f = open('1percent_ImageNet.txt', 'r')
	data = f.read()
	lines = data.split('\n')
	for i in range(10):
		s = lines[i].split('\t')
		if len(s) == 3:
			img = get_image(s[0])
			images = np.array([img] * 5, dtype=np.uint8)
			write_img = np.zeros((256, (256+10)*5,3), dtype=np.uint8)
			images_aug = seq(images=images)
			for j, img in enumerate(images_aug):
				write_img[:, j*(256+10): j*(256+10)+256, :] = img
			cv2.imwrite('final_{}.jpg'.format(i), write_img)

Week 3 Assignment - 陈贻国

# run on Windows
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import requests

keyword = 'Inflatable arch city street'
url = 'https://www.google.com.hk/search?q=' + keyword + '&tbm=isch'


class Crawler_google_images:
    def __init__(self):
        self.url = url

    def init_browser(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-infobars")
        browser = webdriver.Chrome(chrome_options=chrome_options)
        browser.get(url)
        browser.maximize_window()
        return browser

    def download_images(self, browser, round=2, save_path='./default'):
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        img_url_dic = []

        pic_id = 0
        pos = 0
        for i in range(round):
            pos += 500
            js = 'var q=document.documentElement.scrollTop=' + str(pos)
            browser.execute_script(js)
            time.sleep(1)

            browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            # locate the "show more results" button (the value attribute below is Google's Chinese UI label)
            show_more_button = browser.find_element(By.CSS_SELECTOR, "input[value='显示更多搜索结果']")
            try:
                while True:
                    # act according to the loading-status message shown by Google
                    # (the Chinese string literals below match Google's UI text: "loading more, please wait",
                    #  "new results loaded, scroll down for more", "you have seen all results",
                    #  "failed to load more, click to retry")
                    message = browser.find_element(By.CSS_SELECTOR, 'div.OuJzKb.Bqq24e').get_attribute(
                        'textContent')
                    # print(message)
                    if message == '正在加载更多内容,请稍候':
                        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                    elif message == '新内容已成功加载。向下滚动即可查看更多内容。':
                        # scrolling to bottom
                        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                        if show_more_button.is_displayed():
                            show_more_button.click()
                    elif message == '看来您已经看完了所有内容':
                        break
                    elif message == '无法加载更多内容,点击即可重试。':
                        show_more_button.click()
                    else:
                        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            except Exception as err:
                print(err)

            img_elements = browser.find_elements_by_tag_name('img')
            for img_element in img_elements:
                img_url = img_element.get_attribute('src')
                if isinstance(img_url, str) and len(
                        img_url) <= 200 and "images" in img_url and img_url not in img_url_dic:
                    try:
                        img_url_dic.append(img_url)
                        filename = "{}/{}.jpg".format(save_path, pic_id)
                        r = requests.get(img_url)
                        with open(filename, 'wb') as f:
                            f.write(r.content)
                        f.close()
                        print("scroll={}, filename={}".format(i, filename))
                        pic_id += 1
                        time.sleep(0.2)

                    except:
                        print('failure')
    def run(self, savepath):
        self.__init__()
        browser = self.init_browser()
        self.download_images(browser, 2, savepath)
        browser.close()
        print("爬取完成")

if __name__ == '__main__':
    craw = Crawler_google_images()
    craw.run("./inflatable_arch")


Run results (Google keyword: Inflatable arch city street):

scroll=1, filename=./inflatable_arch/805.jpg
scroll=1, filename=./inflatable_arch/806.jpg
Crawling finished

806 images were collected in total; below are some sample screenshots:


Below is a screenshot of the annotation requirements document.

Week 3 Assignment - 骆明

Code

import os
import time
import requests
import urllib.request
from selenium import webdriver


# Create the Chrome driver and open the URL
def get_chrome(url):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-infobars")
    chrome = webdriver.Chrome(chrome_options=chrome_options)
    chrome.get(url)
    # maximize the window; all images visible in the window will be crawled later
    chrome.maximize_window()
    return chrome


def download_images(chrome, target_name, scroll_count=1, save_path='./default'):
    save_path_target = save_path + '/' + target_name
    if not os.path.exists(save_path_target):
        os.makedirs(save_path_target)

    # keep track of downloaded image URLs to avoid duplicates
    img_url_dic = {}

    pic_id = 1  # image index
    pos = 0
    for i in range(scroll_count):
        pos += 500
        # scroll down
        js = 'var q=document.documentElement.scrollTop=' + str(pos)
        chrome.execute_script(js)
        time.sleep(1)

        img_elements = chrome.find_elements_by_tag_name('img')
        for img_element in img_elements:
            img_url = img_element.get_attribute("src")
            # filter out URLs that are not images
            if isinstance(img_url, str) and len(img_url) <= 200 and "images" in img_url:
                if img_url not in img_url_dic:
                    img_url_dic[img_url] = True

                    filename = "{}/{}_{}.jpg".format(save_path_target, target_name, pic_id)
                    r = requests.get(img_url)
                    with open(filename, 'wb') as f:
                        f.write(r.content)
                    f.close()
                    print("scroll={}, filename={}".format(i, filename))

                    pic_id += 1

                    # throttle to avoid anti-crawling measures
                    time.sleep(0.2)


if __name__ == '__main__':
    target_name_list = ["拱门", "充气拱门", "气球拱门", "拱门花架"]  # "arch", "inflatable arch", "balloon arch", "arch flower stand"
    scroll_count = 50
    save_path = "./arch"

    for target_name in target_name_list:
        url_format = "https://www.google.com.hk/search?q={}&newwindow=1&safe=strict&hl=zh-CN&tbm=isch&oq={}&sclient=img"
        url = url_format.format(urllib.parse.quote(target_name, safe='/'), urllib.parse.quote(target_name, safe='/'))
        print(url)

        chrome = get_chrome(url)
        download_images(chrome, target_name, scroll_count, save_path)
        chrome.close()

Output

[Output screenshots 1-5]

Inflatable arch detection annotation requirements document

Week 3 Assignment - 李乐亮
# coding=utf-8

import os
import time
import requests
import urllib.request
import json

KEYWORDS = '充气拱门'  # "inflatable arch"
SAVE_PATH = "./arches"
MAX_NB_PAGES = 50
FILE_NAME_PIR = 'inflatable-arches-'


def get_onepage_urls(onepageurl):
    pic_urls = []
    if not onepageurl:
        print('last page')
        return pic_urls
    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        r = requests.get(onepageurl, headers=headers)
        text = r.text
        json_result = json.loads(text)
    except Exception as e:
        print(e)
        return pic_urls

    dataset = json_result['data']
    for i, data in enumerate(dataset):
        if data != {}:
            pic_url = data['thumbURL']
            pic_urls.append(pic_url)
    return pic_urls


def down_pic(pic_urls, fanye_count, loc, kw):
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=5)
            string = loc + dict_trans[kw] + '/' + FILE_NAME_PIR + str(fanye_count * 30 + i + 1) + '.jpg'
            print('Saving: ' + string)
            with open(string, 'wb') as f:
                f.write(pic.content)
                # print(kw + 'Succeded at %s:' % str(i + 1) + "url:" + pic_url)
                f.close()
            pic = 0
        except Exception as e:
            print('Fail at %s: %s' % (FILE_NAME_PIR + str(i + 1), str(pic_url)))
            print(e)
            continue


if os.path.isdir(SAVE_PATH) is False:
    os.mkdir(SAVE_PATH)
dict_trans = {}

mk = 0
keywords = KEYWORDS.split('-')
for keyword in keywords:
    mk += 1
    dict_trans[keyword] = str(mk)

print(len(keywords))
print('Start Scrapping')
time.sleep(1)

all_pic_urls = set()
for keyword in keywords:
    print(keyword)
    time.sleep(1)
    keyword_ = dict_trans[keyword]

    if os.path.isdir(SAVE_PATH + keyword_) is True:
        pass
    else:
        os.mkdir(SAVE_PATH + keyword_)

    # start position
    pn = 0
    url_init_first = r'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10279846706763970477&ipn=rj&ct=201326592&is=&fp=result&queryWord=' + urllib.parse.quote(
        keyword,
        safe='/') + '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=' + urllib.parse.quote(
        keyword,
        safe='/') + '&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn={}&rn=30&gsm=5a&1605536028872='

    url_init = url_init_first
    print(url_init)
    pic_urls = set()

    fanye_count = 0

    while True:
        fanye_url = url_init_first.format(fanye_count * 30)
        onepage_urls = get_onepage_urls(fanye_url)
        print('Page%s' % fanye_count)
        if fanye_count > MAX_NB_PAGES:
            break
        down_pic(onepage_urls, fanye_count, SAVE_PATH, keyword)
        fanye_count += 1

    # all_pic_urls.update(pic_urls)
    time.sleep(1)

Images


Requirements document


Week 3 Assignment

# date: 2020.5.25
# author: pmy
# aim: crawl images from Google Images

from selenium import webdriver
import time
import os
import requests

# How to use a proxy (on Windows you can simply use a system-wide proxy instead):
# browserOptions = webdriver.ChromeOptions()
# browserOptions.add_argument('--proxy-server=ip:port')
# browser = webdriver.Chrome(chrome_options=browserOptions)

# Change keyword to modify the search term
keyword = 'Inflatable Arch'
url = 'https://www.google.com.hk/search?q='+keyword+'&tbm=isch'


class Crawler_google_images:
    # initialization
    def __init__(self):
        self.url = url

    # create the Chrome driver and open the url
    def init_browser(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        browser = webdriver.Chrome(chrome_options=chrome_options)
        # open the url
        browser.get(self.url)
        # maximize the window; all images visible in the window will be crawled
        browser.maximize_window()
        return browser

    # download images
    def download_images(self, browser, round=2):
        picpath = './cat'
        # create the directory if it does not exist
        if not os.path.exists(picpath):
            os.makedirs(picpath)
        # record downloaded image URLs to avoid duplicate downloads
        img_url_dic = []

        count = 0  # image index
        pos = 0
        for i in range(round):
            pos += 500
            # scroll down
            js = 'var q=document.documentElement.scrollTop=' + str(pos)
            browser.execute_script(js)
            time.sleep(1)
            # find the images
            # html = browser.page_source  # alternatively, grab the current page's HTML and parse it with BeautifulSoup
            # fetching elements directly by tag_name is the simplest and most convenient approach

            img_elements = browser.find_elements_by_tag_name('img')
            # iterate over the fetched WebElements
            for img_element in img_elements:
                img_url = img_element.get_attribute('src')
                # the first few URLs are too long and are not real image URLs; filter them out and crawl the rest
                if isinstance(img_url, str):
                    if len(img_url) <= 200:
                        # filter out the interfering Google icons
                        if 'images' in img_url:
                            # skip URLs that were already crawled, since each pass over the current window may repeat images
                            # (this could be optimized to keep only the previous pass's URLs and save memory, but it is left as is)
                            if img_url not in img_url_dic:
                                try:
                                    img_url_dic.append(img_url)
                                    # download and save the image to the current directory
                                    filename = "./cat/" + str(count+388) + ".jpg"
                                    r = requests.get(img_url)
                                    with open(filename, 'wb') as f:
                                        f.write(r.content)
                                    count += 1
                                    print('downloaded image %d' % count)
                                    # throttle to avoid anti-crawling measures
                                    time.sleep(0.2)
                                except:
                                    print('failure')

    def run(self):
        self.__init__()
        browser = self.init_browser()
        self.download_images(browser, 100)  # the number of scroll rounds can be changed; roughly 10 rounds yield 100+ images
        browser.close()
        print("Crawling finished")

 
if __name__ == '__main__':
    craw = Crawler_google_images()
    craw.run()

A simple benchmark split (train/verify/test); attribute differences are not taken into account.

import os
import random 
import shutil
if __name__ == "__main__":
	train_rate = 0.6
	verify_rate = 0.2
	test_rate = 0.2
	origin_path = "/Users/weishouxin/Desktop/door"
	file_list = os.listdir(origin_path)
	if '.DS_Store' in file_list:
		file_list.remove('.DS_Store')
	data_num = len(file_list)
	print(data_num)
	train_num = int(train_rate * data_num)
	train_sample = random.sample(file_list, train_num)
	left_sample = list(set(file_list)-set(train_sample))
	verify_rate_2 = verify_rate/(verify_rate + test_rate)
	verify_num = int(len(left_sample) * verify_rate_2)
	verify_sample = random.sample(left_sample, verify_num)
	test_sample = list(set(left_sample) - set(verify_sample))
	sample = [train_sample, verify_sample, test_sample]
	save_dir = ["./train/", "./verify/", "./test/"]
	print(test_sample)
	for k in range(len(sample)):
		for name in sample[k]:
			if not os.path.isdir(save_dir[k]):
				os.makedirs(save_dir[k])
			shutil.copy(os.path.join(origin_path, name), os.path.join(save_dir[k], name))

Annotation document

Week 3 Assignment - 余立春
import requests
from lxml import etree
import os
from multiprocessing.dummy import Pool
import json
from time import time


# Purpose: crawl Bing images by keyword and image count, and save them to the given path.
# Usage: run a single command, e.g. BingImagesSpider('电脑美女壁纸', 200, 'E:\images').run()
class BingImagesSpider:
    thread_amount = 10  # thread pool size; the pool is used for the many IO-bound requests to cut the total HTTP time
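    # Illustrative note: multiprocessing.dummy.Pool runs the mapped function in threads, e.g.
    # Pool(10).map(requests.get, urls) issues up to 10 HTTP requests concurrently.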
    per_page_images = 30  # number of images requested from Bing per page
    count = 0  # image counter
    success_count = 0
    # characters to strip from image titles
    ignore_chars = ['|', '.', ',', ',', '', '', '/', '@', ':', ':', ';', ';', '[', ']', '+']
    # allowed image types
    image_types = ['bmp', 'jpg', 'png', 'tif', 'gif', 'pcx', 'tga', 'exif', 'fpx', 'svg', 'psd', 'cdr', 'pcd', 'dxf',
                   'ufo', 'eps', 'ai', 'raw', 'WMF', 'webp']
    # request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
    # Bing images URL pattern
    bing_image_url_pattern = 'https://www.bing.com/images/async?q={}&first={}&count={}&mmasync=1'

    def __init__(self, keyword, amount, path='./'):
        # keyword: the keyword to crawl
        # amount: the number of images to crawl
        # path: where the images are saved
        self.keyword = keyword
        self.amount = amount
        self.path = path
        self.thread_pool = Pool(self.thread_amount)

    def __del__(self):
        self.thread_pool.close()
        self.thread_pool.join()

    # Request a Bing image-results page
    def request_homepage(self, url):
        # url: the URL of the Bing images page
        return requests.get(url, headers=self.headers)

    # Parse the Bing result page and return a list with the information of every image.
    # Each image's information is stored as a dict whose keys are image_title, image_type, image_md5, image_url.
    def parse_homepage_response(self, response):
        # response: the response from Bing

        # extract the JSON-formatted strings m that hold each image's information
        tree = etree.HTML(response.text)
        m_list = tree.xpath('//*[@class="imgpt"]/a/@m')

        # process each image
        info_list = []
        for m in m_list:
            dic = json.loads(m)

            # remove characters that are not allowed in file names
            image_title = dic['t']
            for char in self.ignore_chars:
                image_title = image_title.replace(char, ' ')
            image_title = image_title.strip()

            # some entries do not include an image format; default to jpg in that case
            image_type = dic['murl'].split('.')[-1]
            if image_type not in self.image_types:
                image_type = 'jpg'

            # store each image's information as a dict
            info = dict()
            info['image_title'] = image_title
            info['image_type'] = image_type
            info['image_md5'] = dic['md5']
            info['image_url'] = dic['murl']

            info_list.append(info)
        return info_list

    # Request an individual image and save it to the path given at initialization
    def request_and_save_image(self, info):
        # info: one image's information as a dict with keys image_title, image_type, image_md5, image_url
        filename = '{} {}.{}'.format(self.count, info['image_title'], info['image_type'])
        filepath = os.path.join(self.path, filename)

        try:
            # request the image
            response = requests.get(info['image_url'], headers=self.headers, timeout=1.5)
            # save the image
            with open(filepath, 'wb') as fp:
                fp.write(response.content)
            # log progress
            self.count += 1
            self.success_count += 1
            print('{}: saving {} done.'.format(self.count, filepath))

        except requests.exceptions.RequestException as e:
            self.count += 1
            print('{}: saving {} failed. url: {}'.format(self.count, filepath, info['image_url']))
            print('\t tip:', e)

    # Deduplicate the list of image info entries
    def deduplication(self, info_list):
        result = []

        # use the image md5 as the unique identifier
        md5_set = set()
        for info in info_list:
            if info['image_md5'] not in md5_set:
                result.append(info)
                md5_set.add(info['image_md5'])
        return result

    # Run the spider and crawl the images
    def run(self):
        # create the directory for saving the images
        if not os.path.exists(self.path):
            os.mkdir(self.path)

        # generate the list of Bing result pages to crawl, based on the keyword and the number of images needed
        homepage_urls = []
        for i in range(int(self.amount / self.per_page_images * 1.5) + 1):  # some images repeat, so request 1.5x as many up front as a margin
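            # Worked example (illustrative numbers): amount=100 and per_page_images=30 give
            # int(100 / 30 * 1.5) + 1 = 6 pages, i.e. about 180 candidate images.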
            url = self.bing_image_url_pattern.format(self.keyword, i * self.per_page_images, self.per_page_images)
            homepage_urls.append(url)
        print('homepage_urls len {}'.format(len(homepage_urls)))

        # request all Bing result pages through the thread pool
        homepage_responses = self.thread_pool.map(self.request_homepage, homepage_urls)

        # parse every image's info (image_title, image_type, image_md5, image_url, ...) from the Bing pages
        info_list = []
        for response in homepage_responses:
            result = self.parse_homepage_response(response)
            info_list += result
        print('info amount before deduplication', len(info_list))

        # drop duplicate images to avoid downloading them twice
        info_list = self.deduplication(info_list)
        print('info amount after deduplication', len(info_list))
        info_list = info_list[: self.amount]
        print('info amount after split', len(info_list))

        # download and save all the images
        self.thread_pool.map(self.request_and_save_image, info_list)
        print('all done. {} successfully downloaded, {} failed.'.format(self.success_count,
                                                                        self.count - self.success_count))


if __name__ == '__main__':
    # keyword: '气拱门' (inflatable arch)
    # number of images wanted: 1000
    # save path: 'D:\working\t\IMAGES'
    start = time()
    BingImagesSpider('气拱门', 1000, 'D:\\working\\t\\IMAGES').run()

Week 3 Assignment

For downloading images, the Microsoft Edge add-on "Download All Images" was used directly.