小劉總--機器學習(深度優先算法)

Lesson-02

  • 深度優先與廣度優先
  • 爬蟲的原理
  • 機器學習的原理

OOV ?

Out of Vocabulary

Lambda

import random

"""adj* => adj* adj | adj null"""

def adj():  return random.choice('藍色的 | 好看的 | 小小的'.split('|')).split()[0]

give_adjs = lambda : adj() + adj_star_2()
give_null = lambda : ''
def adj_star_2():
    return random.choice([give_null, give_adjs])()
    #return random.choice([lambda : '', lambda : adj() + adj_star_2()])()

def adj_star(): # infite loop
    return random.choice(['', adj() + adj_star()])

adj_star_2()

def simple_func(x, y): 
    return x + y

results = []


for e in [1, 3, 5, 7, 9, 10]:
    results.append(operate(e))

results

#def operate(e): return e * 2 + 2 - 9

for i in map(lambda x: x * 2 + 2 - 9, [1, 3, 5, 7, 9, 10]): 
    print(i)

operator_2 = lambda x: x * 2 + 2 - 9
#def operate(e): return e * 2 + 2 - 9

def absolutly(x): 
    if x < 0: return -1 * x
    else:
        return x

def mod_3(x): return x % 3

mod_3(13)

sorted([1, 3, 4, 1, 5, 1, 1, 4, 9, -1, 9, -19, 1], key=mod_3)

sorted([1, 3, 4, 1, 5, 1, 1, 4, 9, -1, 9, -19, 1], key=lambda x: x % 3)

Search Policy

coordination_source = """
{name:'Lanzhou', geoCoord:[103.73, 36.03]},
{name:'Jiayuguan', geoCoord:[98.17, 39.47]},
{name:'Xining', geoCoord:[101.74, 36.56]},
{name:'Chengdu', geoCoord:[104.06, 30.67]},
{name:'Shijiazhuang', geoCoord:[114.48, 38.03]},
{name:'Lasa', geoCoord:[102.73, 25.04]},
{name:'Guiyang', geoCoord:[106.71, 26.57]},
{name:'Wuhan', geoCoord:[114.31, 30.52]},
{name:'Zhengzhou', geoCoord:[113.65, 34.76]},
{name:'Jinan', geoCoord:[117, 36.65]},
{name:'Nanjing', geoCoord:[118.78, 32.04]},
{name:'Hefei', geoCoord:[117.27, 31.86]},
{name:'Hangzhou', geoCoord:[120.19, 30.26]},
{name:'Nanchang', geoCoord:[115.89, 28.68]},
{name:'Fuzhou', geoCoord:[119.3, 26.08]},
{name:'Guangzhou', geoCoord:[113.23, 23.16]},
{name:'Changsha', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'Shengyang', geoCoord:[123.38, 41.8]},
{name:'Changchun', geoCoord:[125.35, 43.88]},
{name:'Haerbing', geoCoord:[126.63, 45.75]},
{name:'Taiyuan', geoCoord:[112.53, 37.87]},
{name:'Xian', geoCoord:[108.95, 34.27]},
//{name:'Taiwan', geoCoord:[121.30, 25.03]},
{name:'Beijing', geoCoord:[116.46, 39.92]},
{name:'Shanghai', geoCoord:[121.48, 31.22]},
{name:'Chongqing', geoCoord:[106.54, 29.59]},
{name:'Tianjing', geoCoord:[117.2, 39.13]},
{name:'Huhehaote', geoCoord:[111.65, 40.82]},
{name:'Nanning', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'Yingchuan', geoCoord:[106.27, 38.47]},
{name:'Wulumuqi', geoCoord:[87.68, 43.77]},
{name:'Xianggang', geoCoord:[114.17, 22.28]},
{name:'Aomen', geoCoord:[113.54, 22.19]}
"""

geo_pattern = "{name:'(\w+)',\s+geoCoord:\[(\d+\.\d+),\s+(\d+\.\d+)\]}"

city_location = {}

for city, x, y in re.findall(geo_pattern, coordination_source):
    city_location[city] = (float(x), float(y))

city_location

cities = {c : [] for c in city_location}

nx.draw(nx.Graph(cities),city_location, with_labels=True, font_size=10, node_size=10)

import math

def geo_distance(origin, destination):
    """
    Calculate the Haversine distance.

    Parameters
    ----------
    origin : tuple of float
        (lat#緯度-北緯南緯-y, long#經度-東經西經-x)
    destination : tuple of float
        (lat, long)

    Returns
    -------
    distance_in_km : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(distance(origin, destination), 1)
    504.2
    """
    #lat1, lon1 = origin
    lon1, lat1 = origin
    
    #lat2, lon2 = destination
    lon2, lat2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c

    return d



city_location['Beijing']

city_location['Tianjing']

city_location['Shengyang']

city_location['Shanghai']

geo_distance((116.46, 39.92), (117.2, 39.13))

平面上: P1: [123.38, 41.8] P2: [121.48, 31.22]

P1: [41.8, 123.38] P2: [31.22, 121.48]

geo_distance((123.38, 41.8), (121.48, 31.22))

geo_distance((41.8, 123.38), (31.22, 121.48))

def get_city_distance(city1, city2):
    return geo_distance(city_location[city1], city_location[city2])    

get_city_distance('Beijing', 'Shanghai')

get_city_distance('Hangzhou', 'Shanghai')

from collections import defaultdict

threshold = 700

#city_connection = {c : [] for c in city_location}
city_connection = defaultdict(list)

for city1 in city_location:
    for city2 in city_location:
        if city1 is city2: continue
        
        distance = get_city_distance(city1, city2)
        
        if distance < threshold:
            city_connection[city1].append(city2)  

city_connection

## 正則表達式

import re

some_text = 'this is a test text, I want some more tests; And I have some tested cases'

'test', 'tests', 'testedededed'

+ 一個或者多個字符

* 指的是0個或者多個字符

re == > regular expresson

pattern = 'test\w*' 
pattern_2 = 'test\w+' 

re.findall(pattern_2, some_text)
#re.findall(pattern, some_text)

with_numbers_text = """
    電話:18910451234 年齡:32 性別:男 ID:90
    電話:18910451234 年齡:32 性別:男
    電話:18910451234 年齡:32 性別:男
    電話:18910451234 年齡:32 性別:男
    電話:189104513123234 年齡:32 性別:男 ID: 312
    電話:18910451234 年齡:32 性別:男
    電話:18989810451234 年齡:32 性別:男 STU_OD: 312
    電話:01018910451234 年齡:3 性別:男
    電話:18910451234 年齡:32 性別:男
    電話:18910451234 年齡:32 性別:男
    電話:18910451234 年齡:32 性別:男
"""

with_constraint_num_pattern = '電話:(\d+) 年齡:(\d+)' 

re.findall(with_constraint_num_pattern, with_numbers_text)

import networkx as nx

relatinship = {
    'Zhangsan': [],
    'Li': [],
    'Huang': []
}

simple_graph = nx.Graph(relatinship)

%matplotlib inline

locations = {
    'Zhangsan': (0, 0),
    'Li': (1, 4),
    'Wang': (9, 0),
    'Liu': (19, 1),
    'Gao': (10, 10), 
    'Huang': (11, 19),
    'Hu': (18, -10)
}

nx.draw(simple_graph, locations, with_labels=True)

city_graph = nx.Graph(city_connection)

nx.draw(city_graph, city_location, with_labels=True)

五子棋 -->

象棋

從鄭州到嘉峪關爲例

廣度優先搜索

def bfs(graph, start, destination):   
#    need_visit = [start]    
    pathes = [[start]]
    
    seen = set()
    
    visited_cities = []
    
#    while need_visit:
    while pathes:
        path = pathes.pop(0)
        
#        froniter = need_visit.pop(0)
        froniter = path[-1]
        
        if froniter in seen: continue
        
        for successor in graph[froniter]:
            if successor in path: continue
            
            new_path = path + [successor]
            
            if successor == destination: # 找到目的地的時候
                visited_cities.append(froniter)
                visited_cities.append(successor)
                return new_path, visited_cities, pathes # 返回了
                
            pathes.append(new_path)
        
        seen.add(froniter)
        
        visited_cities.append(froniter)
    
    return seen, visited_cities, pathes
            

def best_fs(graph, start, destination, strategy):   
#    need_visit = [start]    
    pathes = [[start]]
    
    seen = set()
    
    visited_cities = []
    
#    while need_visit:
    while pathes:
        path = pathes.pop(0)
        
#        froniter = need_visit.pop(0)
        froniter = path[-1]
        
        if froniter in seen: continue
        
        for successor in graph[froniter]:
            if successor in path: continue
            
            new_path = path + [successor]
            
            if successor == destination: # 找到目的地的時候
                visited_cities.append(froniter)
                visited_cities.append(successor)
                return new_path, visited_cities, pathes # 返回了
                
            pathes.append(new_path)
        
        seen.add(froniter)
        
        visited_cities.append(froniter)
        
        pathes = sorted(pathes, key=strategy)
    
    return seen, visited_cities, pathes
            

p, v, ps = best_fs(city_graph, 'Haerbing', 'Hefei', get_total_length)

pretty_print(p)

p, v, ps = best_fs(city_graph, 'Haerbing', 'Hefei', lambda x: -len(x))

pretty_print(p)
get_total_length(p)

p, v, ps = bfs(city_graph, 'Haerbing', 'Hefei')

pretty_print(p)

get_total_length(p)

def pretty_print(cities):
    print('🚘->'.join(cities))

p, v, ps = bfs(city_graph, 'Haerbing', 'Hefei')

pretty_print(p)

def get_total_length(path):
    distance = 0 
    for i, city in enumerate(path[:-1]):
        current, next_c = city, path[i+1]
        distance += get_city_distance(current, next_c)
    
    return distance

depth_first_search(city_graph, 'Beijing', 'Hefei')

深度優先算法

def depth_first_search(graph, start, destination):   
#    need_visit = [start]    
    pathes = [[start]]
    
    seen = set()
    
    visited_cities = []
    
#    while need_visit:
    while pathes:
  #      path = pathes.pop(0) # 把以前的老路先走完
        path = pathes.pop(-1) # 取最新的路徑
        
#        froniter = need_visit.pop(0)
        froniter = path[-1]
        
        if froniter in seen: continue
        
        for successor in graph[froniter]:
            if successor in path: continue
            
            new_path = path + [successor]
            
            if successor == destination: # 找到目的地的時候
                visited_cities.append(froniter)
                visited_cities.append(successor)
                return new_path, visited_cities, pathes # 返回了
                
            pathes.append(new_path)
        
        seen.add(froniter)
        
        visited_cities.append(froniter)
    
    return seen, visited_cities, pathes
            

path, visited_cities, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan')

path

pathes

color_map = ['green' for _ in city_graph]

def visited_procedure(graph, postion, visited_order, step):

    changed = visited_order[:step] if step else visited_order
    
    color_map = ['green' if c in changed else 'red' for c in graph]
    
    nx.draw(graph, postion, node_color=color_map, with_labels=True)

path, visited_cities_bfs, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan')
visited_procedure(city_graph, city_location, visited_cities_bfs, step=20)

path, visited_cities_dfs, pathes = depth_first_search(city_connection, 'Zhengzhou', 'Jiayuguan')

visited_procedure(city_graph, city_location, visited_cities_dfs, step=18)

path

找到了從鄭州到嘉峪關的路

path, visited_cities, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan')

visited_procedure(city_graph, city_location, visited_cities, step=None)

爬蟲、搜索引擎與BFS,DFS的關係

如果你要做一個爬蟲程序,你應該用BFS還是DFS?

import requests

https://github.com/Autobicycle-Never-Blocking/Beloved-Little-Moto/blob/master/crawl.py

作業:從https://ditie.mapbar.com/beijing/開始

  • 爬蟲
  • 正則表達式
  • 數據預處理
  • BFS
  • Best-First-Search
    -> 實現北京地鐵的換乘方案
url = 'https://ditie.mapbar.com/beijing/'

response = requests.get(url)

pattern = """<a href="(http://ditie\.mapbar\.com/beijing/\w+/)" style="background-color:#\w+" target="_blank">(\w+)</a>"""

re.findall(pattern, response.text)

a_station = 'https://ditie.mapbar.com/beijing/line_ditiebatongxian/'

response = requests.get(a_station)

pattern_s = '<li\s*.*><a class="cl\-station" href="http://ditie\.mapbar\.com/beijing/station_\w+/" target="_blank" title="\w+">(\w+)</a></li>'

re.findall(pattern_s, response.text)

{
    '八通線'[],
    '一號線'[],
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章