Lecture-02-Search-Policy-and-Simple-Machine-Learning

Problem

When we draw graphs with networkx and label them in Chinese, the characters fail to display, because matplotlib's default font has no CJK glyphs.

Solution
  1. Download the font SimHei.ttf from the course's GitHub repository;
  2. In a Jupyter notebook, run

import matplotlib
print(matplotlib.__path__)

Find matplotlib's install path and cd into it. From there, keep going: cd into mpl-data/fonts/ttf. Then replace the file DejaVuSans.ttf with the SimHei.ttf we just downloaded:

$ mv SimHei.ttf DejaVuSans.ttf

The ttf font used here is attached to this post.
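Alternatively, if you would rather not overwrite DejaVuSans.ttf, matplotlib can be pointed at a CJK font at runtime. A minimal sketch, assuming SimHei (or another CJK font) is already installed on your system:

import matplotlib
# try SimHei first when rendering sans-serif text
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
# keep the minus sign rendering correctly alongside a CJK font
matplotlib.rcParams['axes.unicode_minus'] = False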

Previous Course

  1. What is a language model? What is the OOV (out-of-vocabulary) problem?
  2. Syntax trees; generating sentences with lambda.

A language model assigns a probability $Pr(w)$ to a word sequence.

Out of Vocabulary (OOV): give every unseen word a small constant probability, lower than that of any seen word:

$Pr(w_o) = \text{constant} < Pr(w_1)$

Language Model: Smoothing
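A minimal sketch of this smoothing idea (get_prob and UNK_PROB are illustrative names, not from the lecture):

from collections import Counter

corpus_tokens = '我們 的 語言 模型 我們 的 的'.split()
counts = Counter(corpus_tokens)
total = sum(counts.values())

UNK_PROB = 1 / (total * 100)  # small constant for unseen words

def get_prob(word):
    # seen words get their relative frequency; OOV words get the constant
    return counts[word] / total if word in counts else UNK_PROB

get_prob('我們'), get_prob('火箭')  # (2/7, the OOV constant)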

import random

"""adj* => null | adj adj*"""

def adj(): return random.choice('藍色的 | 好看的 | 小小的'.split('|')).split()[0]

def adj_star_2():
    # each branch is wrapped in a lambda, so the recursive call is only
    # evaluated when that branch is actually chosen
    return random.choice([lambda: '', lambda: adj() + adj_star_2()])()

def adj_star():
    # buggy version: Python evaluates BOTH list elements before choosing,
    # so every call recurses again and the function never terminates
    return random.choice(['', adj() + adj_star()])

The key is the deferred call: lambda: adj() + adj_star_2() builds its string only when the lambda is invoked, which is what lets the recursion stop.

numbers = [1, -2, -4, 1, 5, 6, 9, -11]

def absolute(number):
    if number < 0:
        return -1 * number
    return number

absolute(-11)

def mod_5(number): 
    return number % 5

sorted(numbers, key=lambda x: x % 5)  # [5, 1, -4, 1, 6, -2, 9, -11]

mod_5_lambda = lambda x: x % 5

mod_5

mod_5_lambda(19)

mod_5(19)

adj_star_2()

## Search Policy

coordination_source = """
{name:'Lanzhou', geoCoord:[103.73, 36.03]},
{name:'Jiayuguan', geoCoord:[98.17, 39.47]},
{name:'Xining', geoCoord:[101.74, 36.56]},
{name:'Chengdu', geoCoord:[104.06, 30.67]},
{name:'Shijiazhuang', geoCoord:[114.48, 38.03]},
{name:'Lasa', geoCoord:[102.73, 25.04]},
{name:'Guiyang', geoCoord:[106.71, 26.57]},
{name:'Wuhan', geoCoord:[114.31, 30.52]},
{name:'Zhengzhou', geoCoord:[113.65, 34.76]},
{name:'Jinan', geoCoord:[117, 36.65]},
{name:'Nanjing', geoCoord:[118.78, 32.04]},
{name:'Hefei', geoCoord:[117.27, 31.86]},
{name:'Hangzhou', geoCoord:[120.19, 30.26]},
{name:'Nanchang', geoCoord:[115.89, 28.68]},
{name:'Fuzhou', geoCoord:[119.3, 26.08]},
{name:'Guangzhou', geoCoord:[113.23, 23.16]},
{name:'Changsha', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'Shengyang', geoCoord:[123.38, 41.8]},
{name:'Changchun', geoCoord:[125.35, 43.88]},
{name:'Haerbing', geoCoord:[126.63, 45.75]},
{name:'Taiyuan', geoCoord:[112.53, 37.87]},
{name:'Xian', geoCoord:[108.95, 34.27]},
//{name:'Taiwan', geoCoord:[121.30, 25.03]},
{name:'Beijing', geoCoord:[116.46, 39.92]},
{name:'Shanghai', geoCoord:[121.48, 31.22]},
{name:'Chongqing', geoCoord:[106.54, 29.59]},
{name:'Tianjing', geoCoord:[117.2, 39.13]},
{name:'Huhehaote', geoCoord:[111.65, 40.82]},
{name:'Nanning', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'Yingchuan', geoCoord:[106.27, 38.47]},
{name:'Wulumuqi', geoCoord:[87.68, 43.77]},
{name:'Xianggang', geoCoord:[114.17, 22.28]},
{name:'Aomen', geoCoord:[113.54, 22.19]}
"""

city_location = {
    '香港': (114.17, 22.28)
}

Input: parse the string above into a dict mapping city name -> (longitude, latitude).

test_string = "{name:'蘭州', geoCoord:[103.73, 36.03]},"

import re

pattern = re.compile(r"name:'(\w+)',\s+geoCoord:\[(\d+\.\d+),\s(\d+\.\d+)\]")
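A quick check on the sample line (in Python 3, \w also matches CJK characters, so the Chinese test string works too):

pattern.findall(test_string)
# [('蘭州', '103.73', '36.03')]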

for line in coordination_source.split('\n'):
    city_info = pattern.findall(line)
    if not city_info: continue
    
    # at this point we have found a city entry on this line
    
    city, long, lat = city_info[0]
    
    long, lat = float(long), float(lat)
    
    city_location[city] = (long, lat)

city_location

import math

def geo_distance(origin, destination):
    """
    Calculate the Haversine distance.

    Parameters
    ----------
    origin : tuple of float
        (lat, long)
    destination : tuple of float
        (lat, long)

    Returns
    -------
    distance_in_km : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(geo_distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c

    return d
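A quick sanity check using the docstring's example:

round(geo_distance((48.1372, 11.5756), (52.5186, 13.4083)), 1)  # Munich -> Berlin, 504.2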

def get_geo_distance(city1, city2):
    # city_location stores (longitude, latitude) but geo_distance expects (lat, long)
    long1, lat1 = city_location[city1]
    long2, lat2 = city_location[city2]
    return geo_distance((lat1, long1), (lat2, long2))

get_geo_distance('Shanghai', 'Hangzhou')

import networkx as nx

city_graph = nx.Graph()

city_graph.add_nodes_from(list(city_location.keys()))

%matplotlib inline

import matplotlib.pyplot as plt

city_location

nx.draw(city_graph, city_location, with_labels=True, node_size=30)

threshold = 300

from collections import defaultdict

city_connection = defaultdict(list)

for c1 in city_location:
    for c2 in city_location:
        if c1 == c2: continue
        
        distance = get_geo_distance(c1, c2)
        
        if distance < threshold:
            # the double loop visits both (c1, c2) and (c2, c1),
            # so appending one direction here already fills both adjacency lists
            city_connection[c1].append(c2)

city_connection

city_with_road = nx.Graph(city_connection)

nx.draw(city_with_road, city_location, with_labels=True, node_size=30)
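If you also want the distances stored on the graph itself (an optional variant, not in the original lecture), networkx supports edge attributes:

city_with_road_weighted = nx.Graph()
for c1, neighbors in city_connection.items():
    for c2 in neighbors:
        # add_edge is idempotent on an undirected Graph, so duplicates are harmless
        city_with_road_weighted.add_edge(c1, c2, weight=get_geo_distance(c1, c2))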

from collections import defaultdict

# use the same romanized names as city_location, so drawing and distance lookups work
simple_connection_info_src = {
    'Beijing': ['Taiyuan', 'Shengyang'],
    'Taiyuan': ['Beijing', 'Xian', 'Zhengzhou'],
    'Lanzhou': ['Xian'],
    'Zhengzhou': ['Taiyuan'],
    'Xian': ['Lanzhou', 'Changsha'],
    'Changsha': ['Fuzhou', 'Nanning'],
    'Shengyang': ['Beijing']
}

simple_connection_info = defaultdict(list)

simple_connection_info.update(simple_connection_info_src)

def bfs(graph, start):
    """
    breadth-first search
    """
    visited = [start]
    
    seen = set()
    
    while visited:
        frontier = visited.pop()
        
        if frontier in seen: continue
        
        for successor in graph[frontier]:
            if successor in seen: continue
            print(successor)
            
            #visited = visited + [successor] # always expand the newest node -> depth first
            visited = [successor] + visited # always expand the earliest discovered node -> breadth first
            
            # the expansion order is what decides depth-first vs. breadth-first
    
        seen.add(frontier)
    
    return seen
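For comparison, flipping that one line gives depth-first search. A minimal sketch:

def dfs(graph, start):
    """depth-first search: same skeleton, but the newest node is expanded first"""
    visited = [start]
    seen = set()
    while visited:
        frontier = visited.pop()
        if frontier in seen: continue
        for successor in graph[frontier]:
            if successor in seen: continue
            print(successor)
            visited = visited + [successor]  # append: the newest node is popped next
        seen.add(frontier)
    return seen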

number_graph = defaultdict(list)

number_graph.update({
    1: [2, 3],
    2: [1, 4], 
    3: [1, 5],
    4: [2, 6], 
    5: [3, 7],
    7: [5, 8]
})

bfs(number_graph, 1)

simple_connection_info['Xian']

nx.draw(nx.Graph(simple_connection_info), city_location, with_labels=True, node_size=10)

def search(start, destination, connection_graph, sort_candidate=lambda paths: paths):
    paths = [[start]]
    
    visited = set()
    
    while paths: # while there are still candidate paths to extend
        path = paths.pop(0)
        frontier = path[-1]
        
        if frontier in visited: continue
            
        successors = connection_graph[frontier]
        
        for city in successors:
            if city in path: continue  # eliminate loops
                
            new_path = path + [city]
            
            paths.append(new_path)
            
            if city == destination: return new_path
        
        visited.add(frontier)
        
        paths = sort_candidate(paths) # a sort function lets us control the search policy

def transfer_stations_first(paths):
    # fewest stations first
    return sorted(paths, key=len)

def transfer_as_much_possible(paths):
    # most stations first
    return sorted(paths, key=len, reverse=True)

def shortest_path_first(paths):
    
    if len(paths) <= 1: return paths
    
    def get_path_distance(path):
        # total geographic length: sum the distances between consecutive stations
        distance = 0
        for station1, station2 in zip(path[:-1], path[1:]):
            distance += get_geo_distance(station1, station2)
            
        return distance

    return sorted(paths, key=get_path_distance)

search('Lanzhou', 'Fuzhou', simple_connection_info, sort_candidate=shortest_path_first)

def pretty_print(cities):
    print('🚗->'.join(cities))

pretty_print(search('Beijing', 'Fuzhou', simple_connection_info))

Breadth-first search

Depth-first search

pretty_print(search('Beijing', 'Nanjing', city_connection))

pretty_print(search('Beijing', 'Changsha', city_connection))

pretty_print(search('Beijing', 'Guangzhou', city_connection, sort_candidate=transfer_stations_first))

pretty_print(search('Beijing', 'Guangzhou', city_connection, sort_candidate=transfer_as_much_possible))

Web Crawlers and BFS

The crawler below applies the same BFS skeleton to job postings on lagou.com: a queue of keywords to search, a set of already-searched keywords, and each job's most common related skills as the successors to expand.

import requests
from lxml import etree

# config and utils are project-local modules (not shown in this post)
from config import config
from utils.common import get_header
from utils.db_utils import insert
from collections import Counter


class LaGou(object):

    def __init__(self, keyword, city, type):
        self.keyword = keyword
        self.city = city
        self.type = type
        self.baseurl = 'https://www.lagou.com/jobs/positionAjax.json'
        self.header = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'https://www.lagou.com/jobs/list_%E8%BF%90%E7%BB%B4?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }

    def spider(self):

        expanded_skills = []

        max_page = 10
        for i in range(1, max_page):
            s = requests.Session()
            s.get(
                url='https://www.lagou.com/jobs/list_運維?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                headers=get_header(), timeout=3)
            cookie = s.cookies
            res = requests.post(self.baseurl, headers=self.header, data={'first': True, 'pn': i, 'kd': self.keyword},
                                params={'px': 'default', 'city': self.city, 'needAddtionalResult': 'false'},
                                cookies=cookie, timeout=3)
            text = res.json()
            all_data = text['content']['positionResult']['result']
            for data in all_data:
                s = requests.Session()
                s.get(
                    url='https://www.lagou.com/jobs/list_運維?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                    headers=get_header(), timeout=3)
                cookie1 = s.cookies
                url = 'https://www.lagou.com/jobs/' + str(data.get('positionId')) + '.html'
                req1 = requests.get(url, headers=self.header, cookies=cookie1)
                req1.encoding = 'utf-8'
                html = etree.HTML(req1.text)
                detail = ''.join(html.xpath('//*[@class="job-detail"]//*/text()')).strip()
                if detail.isspace():
                    detail = ''.join(html.xpath('//*[@class="job-detail"]/text()')).strip()
                # print(detail)

                related_skills = data.get('skillLables') or []  # key may be missing in the response

                data_dict = {
                    "firstType": str(data.get('firstType')),
                    "secondType": str(data.get('secondType')),
                    "thirdType": str(data.get('thirdType')),
                    "city": str(data.get("city")),
                    "positionName": str(data.get('positionName')),
                    "district": str(data.get('district')),
                    "stationname": str(data.get('stationname')),
                    "jobNature": str(data.get('jobNature')),
                    "companyLabelList": str(data.get('companyLabelList')),
                    "industryField": str(data.get('industryField')),
                    "salary": str(data.get('salary')),
                    "companySize": str(data.get('companySize')),
                    "skillLables": str(related_skills),
                    "createTime": str(data.get('createTime')),
                    "companyFullName": str(data.get('companyFullName')),
                    "workYear": str(data.get('workYear')),
                    "education": str(data.get('education')),
                    "positionAdvantage": str(data.get('positionAdvantage')),
                    "url": str(url),
                    "detail": str(detail),
                    "type": str(self.type),
                    "latitude": str(data.get("latitude")),
                    "longitude": str(data.get("longitude")),
                    "keyword": str(self.keyword),
                }
                # print(data_dict)
                # time.sleep(random.randint(1, 5))

                expanded_skills += related_skills

                # print(related_skills)

                if not insert('jobs', **data_dict):
                    continue

        return [s.lower() for s in expanded_skills]


def lagou_worker(city):
    _, position, init_job = config()
    visited_jobs = set()
    while init_job:
        search_job = init_job.pop(0)

        print('We need to search {}, now search {}'.format(init_job, search_job))

        if search_job in visited_jobs:
            continue
        type = ''
        for k, v in position.items():
            if search_job in v:
                type = k

        new_expanded = LaGou(keyword=search_job, city=city,
                             type=type).spider()

        expanded_counter = Counter(new_expanded).most_common(n=5)

        new_jobs = [j for j, n in expanded_counter]

        init_job += new_jobs

        visited_jobs.add(search_job)


if __name__ == '__main__':

    init_job = ['人工智能', '測試', '運維', '交互設計', '數據產品經理', '原畫師', '動畫師', '區塊鏈', '產品經理', '用戶運營', '數據運營']

    visited_jobs = set()

    while init_job:
        search_job = init_job.pop(0)

        print('We need to search {}, now search {}'.format(init_job, search_job))

        if search_job in visited_jobs: continue

        new_expanded = LaGou(keyword=search_job, city='全國', type='產品線').spider()

        expanded_counter = Counter(new_expanded).most_common(n=5)

        new_jobs = [j for j, n in expanded_counter]

        init_job += new_jobs

        visited_jobs.add(search_job)
        print(search_job)
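Note how both loops mirror the bfs skeleton above: init_job is the frontier queue, visited_jobs is the seen set, and each job's five most common related skills are the successors that get appended.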

Machine Learning

from sklearn.datasets import load_boston  # note: removed in scikit-learn 1.2; this needs an older version

data = load_boston()

X, y = data['data'], data['target']

X[1]

y[1]

len(X[:, 0])

len(y)

%matplotlib inline

import matplotlib.pyplot as plt

def draw_rm_and_price():
    # column 5 is RM, the average number of rooms per dwelling
    plt.scatter(X[:, 5], y)

draw_rm_and_price()

import random

def price(rm, k, b):
    """f(x) = k * x + b"""
    return k * rm + b  

X_rm = X[:, 5]
k = random.randint(-100, 100)
b = random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]

draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)


We write the model's prediction as $\hat{y}$.

list(y)

price_by_random_k_and_b

How far are the predictions from the truth? For example, how bad is predicting [2, 2, 2] when the truth is [1, 1, 1]? We measure this with a loss function:

$$ loss = \frac{1}{n} \sum_i{(y_i - \hat{y_i})}^2 $$

$$ loss = \frac{1}{n} \sum_i{(y_i - (kx_i + b))}^2 $$

$$ \frac{\partial{loss}}{\partial{k}} = -\frac{2}{n}\sum_i(y_i - (kx_i + b))x_i $$

$$ \frac{\partial{loss}}{\partial{k}} = -\frac{2}{n}\sum_i(y_i - \hat{y_i})x_i $$

$$ \frac{\partial{loss}}{\partial{b}} = -\frac{2}{n}\sum_i(y_i - \hat{y_i}) $$

def loss(y, y_hat): # mean squared error, to evaluate the performance
    return sum((y_i - y_hat_i)**2 for y_i, y_hat_i in zip(list(y), list(y_hat))) / len(list(y))
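A quick check on the toy example above:

loss([1, 1, 1], [2, 2, 2])  # every squared error is 1, so the mean is 1.0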

1st-Method: Random Generation: get the best k and b

trying_times = 2000

min_loss = float('inf')
best_k, best_b = None, None

for i in range(trying_times):
    k = random.random() * 200 - 100
    b = random.random() * 200 - 100
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]

    current_loss = loss(y, price_by_random_k_and_b)
    
    if current_loss < min_loss:
        min_loss = current_loss
        best_k, best_b = k, b
        print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))

10 ** 0.5

X_rm = X[:, 5]
k = 15
b = -68
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]

draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)

2nd-Method: Direction Adjusting

trying_times = 2000

min_loss = float('inf')

best_k = random.random() * 200 - 100
best_b = random.random() * 200 - 100

direction = [
    (+1, -1),  # first element: k's change direction, second element: b's change direction
    (+1, +1), 
    (-1, -1), 
    (-1, +1),
]

next_direction = random.choice(direction)

scalar = 0.1

update_time = 0

for i in range(trying_times):
    
    k_direction, b_direction = next_direction
    
    current_k, current_b = best_k + k_direction * scalar, best_b + b_direction * scalar
    
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]

    current_loss = loss(y, price_by_k_and_b)
    
    if current_loss < min_loss: # performance became better
        min_loss = current_loss
        best_k, best_b = current_k, current_b
        
        next_direction = next_direction  # keep the direction that just improved the loss
        update_time += 1
        
        if update_time % 10 == 0: 
            print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))
    else:
        next_direction = random.choice(direction)  # otherwise pick a new random direction

If we want faster updates, that is, better results in less time, we need one thing:

finding the right direction of change.

How do we find the right direction?

3rd-Method: supervise how the parameters change -> supervised learning

Derivatives

def partial_k(x, y, y_hat):
    n = len(y)

    gradient = 0
    
    for x_i, y_i, y_hat_i in zip(list(x), list(y), list(y_hat)):
        gradient += (y_i - y_hat_i) * x_i
    
    return -2 / n * gradient


def partial_b(x, y, y_hat):
    n = len(y)

    gradient = 0
    
    for y_i, y_hat_i in zip(list(y), list(y_hat)):
        gradient += (y_i - y_hat_i)
    
    return -2 / n * gradient
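For reference, the same gradients vectorized with numpy (an equivalent sketch, not part of the original lecture code):

import numpy as np

def partial_k_vec(x, y, y_hat):
    # -2/n * sum((y_i - y_hat_i) * x_i), as a single array expression
    return -2 * np.mean((np.asarray(y) - np.asarray(y_hat)) * np.asarray(x))

def partial_b_vec(y, y_hat):
    # -2/n * sum(y_i - y_hat_i)
    return -2 * np.mean(np.asarray(y) - np.asarray(y_hat))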

from icecream import ic

trying_times = 2000

X, y = data['data'], data['target']

min_loss = float('inf') 

current_k = random.random() * 200 - 100
current_b = random.random() * 200 - 100

learning_rate = 1e-04


update_time = 0

for i in range(trying_times):
    
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    
    current_loss = loss(y, price_by_k_and_b)

    if current_loss < min_loss: # performance became better
        min_loss = current_loss
        
        if i % 50 == 0: 
            print('When time is : {}, get current_k: {} current_b: {}, and the loss is: {}'.format(i, current_k, current_b, min_loss))

    k_gradient = partial_k(X_rm, y, price_by_k_and_b)
    
    b_gradient = partial_b(X_rm, y, price_by_k_and_b)
    
    current_k = current_k + (-1 * k_gradient) * learning_rate

    current_b = current_b + (-1 * b_gradient) * learning_rate


X_rm = X[:, 5]
k = 11.431551629413757  # example values produced by the gradient-descent loop above
b = -49.52403584539048
price_by_learned_k_and_b = [price(r, k, b) for r in X_rm]

draw_rm_and_price()
plt.scatter(X_rm, price_by_learned_k_and_b)

loss([1, 1, 1], [2, 2, 3])

data['feature_names']