問題描述
大家好,我們在用 networkx 顯示中文的時候,會發現不能顯示中文。
解決辦法
- 下載Github倉庫中的字體SimHei.ttf;
- 在 jupyter notebook 中執行
import matplotlib
print(matplotlib.__path__)
找到 matplotlib 的路徑,然後 cd 到這個路徑。 cd 到這個路徑之後,繼續 cd,cd 到 mpl-data/fonts/ttf 這個路徑。 然後把 DejaVuSans.ttf
這個文件換成我們剛剛下載的文件。
$ mv SimHei.ttf DejaVuSans.ttf
其中,用到的 ttf 字體。 我給大家傳到附件上了。
Previous Course
- What’s the language model? OOV problem?
- Syntax Tree. lambda
Out of Vocabulary(OOV)
Pr(w_o) = constant < Pr(w_1)
Language Model: Smooth
import random
"""adj* => adj* adj | adj null"""
def adj(): return random.choice('藍色的 | 好看的 | 小小的'.split('|')).split()[0]
def adj_star_2():
return random.choice([lambda : '', lambda : adj() + adj_star_2()])()
def adj_star():
return random.choice(['', ajd() + adj_star()])
lambda : adj() + adj_star_2()
numbers = [1, -2, -4, 1, 5, 6, 9, -11]
def absolute(number):
    """Return the absolute value of `number` (hand-rolled equivalent of abs)."""
    return -1 * number if number < 0 else number
absolute(-11)
def mod_5(number):
    """Remainder of `number` modulo 5 (Python semantics: non-negative for positive modulus)."""
    remainder = number % 5
    return remainder
# sort by remainder mod 5; this lambda is equivalent to mod_5 above
sorted(numbers, key=lambda x: x % 5)
# the same function written as a lambda expression
mod_5_lambda = lambda x: x % 5
# notebook inspection: the function object, then two equivalent calls (both 4)
mod_5
mod_5_lambda(19)
mod_5(19)
# generate a random adjective string from the toy grammar
adj_star_2()
## Search Policy
# Raw city -> coordinate data in an echarts-style literal format:
# each line reads {name:'<City>', geoCoord:[<longitude>, <latitude>]},
# NOTE(review): the // prefix is a JavaScript comment marker; the regex parser
# below matches on substring content, so //-prefixed lines may still be parsed
# — confirm whether those entries are meant to be excluded.
coordination_source = """
{name:'Lanzhou', geoCoord:[103.73, 36.03]},
{name:'Jiayuguan', geoCoord:[98.17, 39.47]},
{name:'Xining', geoCoord:[101.74, 36.56]},
{name:'Chengdu', geoCoord:[104.06, 30.67]},
{name:'Shijiazhuang', geoCoord:[114.48, 38.03]},
{name:'Lasa', geoCoord:[102.73, 25.04]},
{name:'Guiyang', geoCoord:[106.71, 26.57]},
{name:'Wuhan', geoCoord:[114.31, 30.52]},
{name:'Zhengzhou', geoCoord:[113.65, 34.76]},
{name:'Jinan', geoCoord:[117, 36.65]},
{name:'Nanjing', geoCoord:[118.78, 32.04]},
{name:'Hefei', geoCoord:[117.27, 31.86]},
{name:'Hangzhou', geoCoord:[120.19, 30.26]},
{name:'Nanchang', geoCoord:[115.89, 28.68]},
{name:'Fuzhou', geoCoord:[119.3, 26.08]},
{name:'Guangzhou', geoCoord:[113.23, 23.16]},
{name:'Changsha', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'Shengyang', geoCoord:[123.38, 41.8]},
{name:'Changchun', geoCoord:[125.35, 43.88]},
{name:'Haerbing', geoCoord:[126.63, 45.75]},
{name:'Taiyuan', geoCoord:[112.53, 37.87]},
{name:'Xian', geoCoord:[108.95, 34.27]},
//{name:'Taiwan', geoCoord:[121.30, 25.03]},
{name:'Beijing', geoCoord:[116.46, 39.92]},
{name:'Shanghai', geoCoord:[121.48, 31.22]},
{name:'Chongqing', geoCoord:[106.54, 29.59]},
{name:'Tianjing', geoCoord:[117.2, 39.13]},
{name:'Huhehaote', geoCoord:[111.65, 40.82]},
{name:'Nanning', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'Yingchuan', geoCoord:[106.27, 38.47]},
{name:'Wulumuqi', geoCoord:[87.68, 43.77]},
{name:'Xianggang', geoCoord:[114.17, 22.28]},
{name:'Aomen', geoCoord:[113.54, 22.19]}
"""
# city name -> (longitude, latitude); seeded with 香港 (absent from
# coordination_source) and filled in by the parsing loop below
city_location = {
    '香港': (114.17, 22.28)
}
Input: String -> dict
test_string = "{name:'蘭州', geoCoord:[103.73, 36.03]},"
import re
pattern = re.compile(r"name:'(\w+)',\s+geoCoord:\[(\d+.\d+),\s(\d+.\d+)\]")
for line in coordination_source.split('\n'):
city_info = pattern.findall(line)
if not city_info: continue
# following: we find the city info
city, long, lat = city_info[0]
long, lat = float(long), float(lat)
city_location[city] = (long, lat)
city_location
import math
def geo_distance(origin, destination):
    """
    Calculate the Haversine distance in kilometres.

    Parameters
    ----------
    origin : tuple of float
        (lat, long)
    destination : tuple of float
        (lat, long)

    Returns
    -------
    distance_in_km : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(geo_distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    earth_radius_km = 6371

    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    # haversine term: sin^2(dphi/2) + cos(phi1)*cos(phi2)*sin^2(dlambda/2)
    half_chord = (math.sin(delta_phi / 2) ** 2
                  + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2) ** 2)
    angular_distance = 2 * math.atan2(math.sqrt(half_chord), math.sqrt(1 - half_chord))
    return earth_radius_km * angular_distance
def get_geo_distance(city1, city2):
    """Haversine distance in km between two cities looked up by name.

    city_location stores tuples as (longitude, latitude) while geo_distance
    expects (latitude, longitude) — the original passed them straight through,
    silently swapping the axes; swap here so the distance is computed correctly.
    """
    lon1, lat1 = city_location[city1]
    lon2, lat2 = city_location[city2]
    return geo_distance((lat1, lon1), (lat2, lon2))
# sanity check: Shanghai–Hangzhou distance in km
get_geo_distance('Shanghai', 'Hangzhou')
import networkx as nx
# graph whose nodes are the parsed city names (no edges yet)
city_graph = nx.Graph()
city_graph.add_nodes_from(list(city_location.keys()))
%matplotlib inline
import matplotlib.pyplot as plt
city_location
# draw using city_location as the position dict: (x, y) = (longitude, latitude)
nx.draw(city_graph, city_location, with_labels=True, node_size=30)
# connect every pair of cities closer than `threshold` km
threshold = 300
from collections import defaultdict

city_connection = defaultdict(list)
for c1 in city_location:
    for c2 in city_location:
        if c1 == c2:
            continue
        distance = get_geo_distance(c1, c2)
        if distance < threshold:
            # the nested loops visit both (c1, c2) and (c2, c1), so a single
            # append records each edge once per direction; the original
            # appended both directions on every visit, duplicating every
            # neighbour entry
            city_connection[c1].append(c2)

city_connection
# graph with an edge for every pair of cities within the distance threshold
city_with_road = nx.Graph(city_connection)
nx.draw(city_with_road, city_location, with_labels=True, node_size=30)
from collections import defaultdict
# hand-written adjacency list for the search demos.
# NOTE(review): it is not fully symmetric (e.g. '長沙' lists '福州' but
# '福州' has no entry) — the defaultdict wrapper below makes missing keys
# yield [] instead of raising KeyError during search.
simple_connection_info_src = {
    '北京': ['太原', '瀋陽'],
    '太原': ['北京', '西安', '鄭州'],
    '蘭州': ['西安'],
    '鄭州': ['太原'],
    '西安': ['蘭州', '長沙'],
    '長沙': ['福州', '南寧'],
    '瀋陽': ['北京']
}
simple_connection_info = defaultdict(list)
simple_connection_info.update(simple_connection_info_src)
def bfs(graph, start):
    """Breadth-first traversal from `start`.

    Prints every node the moment it is discovered and returns the set of
    nodes that were expanded.
    """
    frontier = [start]
    expanded = set()
    while frontier:
        node = frontier.pop()
        if node in expanded:
            continue
        for successor in graph[node]:
            if successor in expanded:
                continue
            print(successor)
            # Prepending new nodes while popping from the end gives FIFO order
            # -> breadth first. Appending instead (frontier + [successor])
            # would expand the newest discovery first -> depth first.
            # The insertion side is what decides the search order.
            frontier = [successor] + frontier
        expanded.add(node)
    return expanded
# small test graph for bfs; defaultdict so leaf nodes (6, 8) resolve to []
number_graph = defaultdict(list)
number_graph.update({
    1: [2, 3],
    2: [1, 4],
    3: [1, 5],
    4: [2, 6],
    5: [3, 7],
    7: [5, 8],
})
bfs(number_graph, 1)
# neighbours of 西安 in the hand-written network
simple_connection_info['西安']
# NOTE(review): the nodes here are Chinese names while city_location is keyed
# by pinyin names (except 香港) — verify the position lookup actually succeeds
nx.draw(nx.Graph(simple_connection_info), city_location, with_labels=True, node_size=10)
def search(start, destination, connection_grpah, sort_candidate=None):
    """Search a path from `start` to `destination` in an adjacency mapping.

    Parameters
    ----------
    start, destination : hashable
        Node names.
    connection_grpah : mapping
        node -> list of neighbour nodes.
    sort_candidate : callable, optional
        Reorders the list of candidate paths after each expansion, which is
        what controls the search policy (BFS-like, DFS-like, shortest-first).
        Defaults to None (no reordering) — the original had no default, so
        calls that omitted it raised TypeError.

    Returns
    -------
    list or None
        The first path found (list of nodes), or None when none exists.
    """
    candidate_paths = [[start]]
    expanded = set()
    while candidate_paths:  # while there are paths left to extend
        path = candidate_paths.pop(0)
        frontier = path[-1]
        if frontier in expanded:
            continue
        for city in connection_grpah[frontier]:
            if city in path:
                continue  # eliminate loops
            new_path = path + [city]
            candidate_paths.append(new_path)
            if city == destination:
                return new_path
        expanded.add(frontier)
        if sort_candidate is not None:
            # the sort function decides which candidate is expanded next
            candidate_paths = sort_candidate(candidate_paths)
    return None
def transfer_stations_first(pathes):
    """Order candidate paths so the one with the fewest stations comes first."""
    return sorted(pathes, key=lambda path: len(path))
def transfer_as_much_possible(pathes):
    """Order candidate paths so the one with the most stations comes first."""
    # negating the length sorts descending while keeping the sort stable,
    # exactly like sorted(..., key=len, reverse=True)
    return sorted(pathes, key=lambda path: -len(path))
def shortest_path_first(pathes):
    """Order candidate paths by total travelled distance, shortest first.

    Fixes the original metric, which summed the distance from every
    intermediate station to the *final* station of the path instead of
    summing the consecutive legs actually travelled.
    """
    if len(pathes) <= 1:
        return pathes

    def get_path_distance(path):
        # total length = sum of distances between consecutive stations
        return sum(get_geo_distance(a, b) for a, b in zip(path[:-1], path[1:]))

    return sorted(pathes, key=get_path_distance)
search('蘭州', '福州', simple_connection_info, sort_candidate=shortest_path_first)
def pretty_print(cities):
    """Print the route as one arrow-joined line, e.g. 北京🚗->上海."""
    route = '🚗->'.join(cities)
    print(route)
pretty_print(search('北京', '福州', simple_connection_info))
Breadth-first search
Depth-first search
# NOTE(review): the first two calls omit sort_candidate — verify search's
# signature provides a default for that parameter
pretty_print(search('北京', '南京', city_connection))
pretty_print(search('北京', '長沙', city_connection))
# fewest-transfers policy vs most-transfers policy on the same query
pretty_print(search('北京', '廣州', city_connection, sort_candidate=transfer_stations_first))
pretty_print(search('北京', '廣州', city_connection, sort_candidate=transfer_as_much_possible))
爬蟲 與 BFS
import requests
import requests
from lxml import etree
from config import config
from utils.common import get_header
from utils.db_utils import insert
from collections import Counter
class LaGou(object):
    """Crawler for lagou.com job postings.

    Fetches paginated search results from the positionAjax JSON endpoint,
    downloads each posting's detail page, persists every posting via
    ``insert('jobs', ...)``, and accumulates the related skill labels seen.
    """

    def __init__(self, keyword, city, type):
        # NOTE(review): `type` shadows the builtin name; kept because callers
        # pass it as a keyword argument
        self.keyword = keyword
        self.city = city
        self.type = type
        # JSON search API behind the site's listing page
        self.baseurl = 'https://www.lagou.com/jobs/positionAjax.json'
        # static headers that make the POST look like the site's own AJAX call
        self.header = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'https://www.lagou.com/jobs/list_%E8%BF%90%E7%BB%B4?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }

    def spider(self):
        """Crawl result pages 1..9 and return all collected skill labels, lowercased."""
        expanded_skills = []
        max_page = 10  # range(1, 10) -> pages 1 through 9
        for i in range(1, max_page):
            # hit the HTML search page first to obtain fresh session cookies,
            # which the JSON endpoint appears to require
            s = requests.Session()
            s.get(
                url='https://www.lagou.com/jobs/list_運維?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                headers=get_header(), timeout=3)
            cookie = s.cookies
            res = requests.post(self.baseurl, headers=self.header, data={'first': True, 'pn': i, 'kd': self.keyword},
                                params={'px': 'default', 'city': self.city, 'needAddtionalResult': 'false'},
                                cookies=cookie, timeout=3)
            text = res.json()
            all_data = text['content']['positionResult']['result']
            for data in all_data:
                # fresh cookies again for each detail-page request
                s = requests.Session()
                s.get(
                    url='https://www.lagou.com/jobs/list_運維?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                    headers=get_header(), timeout=3)
                cookie1 = s.cookies
                url = 'https://www.lagou.com/jobs/' + str(data.get('positionId')) + '.html'
                req1 = requests.get(url, headers=self.header, cookies=cookie1)
                req1.encoding = 'utf-8'
                html = etree.HTML(req1.text)
                # job description text; fall back to the shallower xpath when
                # the nested one returns only whitespace
                detail = ''.join(html.xpath('//*[@class="job-detail"]//*/text()')).strip()
                if detail.isspace():
                    detail = ''.join(html.xpath('//*[@class="job-detail"]/text()')).strip()
                # print(detail)
                related_skills = data.get('skillLables')
                # flatten the fields we persist; everything is stringified
                data_dict = {
                    "firstType": str(data.get('firstType')),
                    "secondType": str(data.get('secondType')),
                    "thirdType": str(data.get('thirdType')),
                    "city": str(data.get("city")),
                    "positionName": str(data.get('positionName')),
                    "district": str(data.get('district')),
                    "stationname": str(data.get('stationname')),
                    "jobNature": str(data.get('jobNature')),
                    "companyLabelList": str(data.get('companyLabelList')),
                    "industryField": str(data.get('industryField')),
                    "salary": str(data.get('salary')),
                    "companySize": str(data.get('companySize')),
                    "skillLables": str(related_skills),
                    "createTime": str(data.get('createTime')),
                    "companyFullName": str(data.get('companyFullName')),
                    "workYear": str(data.get('workYear')),
                    "education": str(data.get('education')),
                    "positionAdvantage": str(data.get('positionAdvantage')),
                    "url": str(url),
                    "detail": str(detail),
                    "type": str(self.type),
                    "latitude": str(data.get("latitude")),
                    "longitude": str(data.get("longitude")),
                    "keyword": str(self.keyword),
                }
                # print(data_dict)
                # time.sleep(random.randint(1, 5))
                # NOTE(review): if 'skillLables' is missing this `+=` raises
                # TypeError on None — confirm the API always returns a list
                expanded_skills += related_skills
                # print(related_skills)
                if not insert('jobs', **data_dict):
                    continue
        # NOTE: `s` here shadows the last requests.Session; items are skill strings
        return [s.lower() for s in expanded_skills]
def lagou_worker(city):
    """Crawl lagou jobs for `city`, breadth-first over job titles.

    Starts from the seed titles supplied by config(); after each crawl the
    five most common related skills are appended as new titles to search.
    """
    _, position, init_job = config()
    visited_jobs = set()
    while init_job:
        search_job = init_job.pop(0)
        print('We need to search {}, now search {}'.format(init_job, search_job))
        if search_job in visited_jobs:
            continue
        # find which category this title belongs to ('' when none matches;
        # the last matching category wins, as in the original)
        job_type = ''
        for category, titles in position.items():
            if search_job in titles:
                job_type = category
        skills = LaGou(keyword=search_job, city=city,
                       type=job_type).spider()
        top_five = Counter(skills).most_common(n=5)
        init_job += [skill for skill, _ in top_five]
        visited_jobs.add(search_job)
if __name__ == '__main__':
    # seed job titles; the crawl expands this queue with co-occurring skills
    init_job = ['人工智能', '測試', '運維', '交互設計', '數據產品經理', '原畫師', '動畫師', '區塊鏈', '產品經理', '用戶運營', '數據運營']
    visited_jobs = set()
    while init_job:
        search_job = init_job.pop(0)
        print('We need to search {}, now search {}'.format(init_job, search_job))
        if search_job not in visited_jobs:
            # crawl nationwide, then enqueue the five most common related skills
            crawled_skills = LaGou(keyword=search_job, city='全國', type='產品線').spider()
            top_skills = Counter(crawled_skills).most_common(n=5)
            init_job += [job for job, _ in top_skills]
            visited_jobs.add(search_job)
    # NOTE(review): prints the last processed title; the notebook export lost
    # the original indentation, so placement inside the guard is assumed
    print(search_job)
Machine Learning
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version supports this import
from sklearn.datasets import load_boston
data = load_boston()
# X: feature matrix, y: target house prices (Boston housing dataset)
X, y = data['data'], data['target']
# notebook inspection: one sample, one target, and the dataset size
X[1]
y[1]
len(X[:, 0])
len(y)
%matplotlib inline
import matplotlib.pyplot as plt
def draw_rm_and_price():
    """Scatter-plot feature column 5 (RM, average rooms) against price.

    Reads the module-level X and y loaded from the Boston dataset.
    """
    plt.scatter(X[:, 5], y)
draw_rm_and_price()
import random
def price(rm, k, b):
    """Linear model f(x) = k * x + b evaluated at rm."""
    predicted = k * rm + b
    return predicted
# RM column (average rooms per dwelling)
X_rm = X[:, 5]
# random linear model as a baseline fit
k = random.randint(-100, 100)
b = random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
# overlay the random line on the real data
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
# the same experiment repeated with a fresh random k and b
X_rm = X[:, 5]
k = random.randint(-100, 100)
b = random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
# notebook inspection of targets vs predictions
list(y)
price_by_random_k_and_b
# scratch examples used to motivate the loss function below
[1, 1, 1]
[2, 2, 2]
loss
def loss(y, y_hat):  # to evaluate the performance
    """Mean squared error between targets y and predictions y_hat."""
    targets = list(y)
    predictions = list(y_hat)
    squared_errors = [(t - p) ** 2 for t, p in zip(targets, predictions)]
    return sum(squared_errors) / len(targets)
First-Method: Random generation: get best k and best b
# Method 1: pure random search over (k, b); keep the best of 2000 trials
trying_times = 2000
min_loss = float('inf')
best_k, best_b = None, None
for i in range(trying_times):
    # sample k and b uniformly from [-100, 100)
    k = random.random() * 200 - 100
    b = random.random() * 200 - 100
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    current_loss = loss(y, price_by_random_k_and_b)
    if current_loss < min_loss:
        min_loss = current_loss
        best_k, best_b = k, b
        print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))
# RMSE intuition: square root of a loss value
10 ** 0.5
# plot a manually chosen fit (k=15, b=-68)
X_rm = X[:, 5]
k = 15
b = -68
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
2nd-Method: Direction Adjusting
# Method 2: direction adjusting — keep stepping in a direction while it
# improves the loss; pick a new random direction when it stops improving
trying_times = 2000
min_loss = float('inf')
best_k = random.random() * 200 - 100
best_b = random.random() * 200 - 100
direction = [
    (+1, -1),  # first element: k's change direction, second element: b's change direction
    (+1, +1),
    (-1, -1),
    (-1, +1),
]
next_direction = random.choice(direction)
scalar = 0.1  # step size
update_time = 0
for i in range(trying_times):
    k_direction, b_direction = next_direction
    current_k, current_b = best_k + k_direction * scalar, best_b + b_direction * scalar
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    current_loss = loss(y, price_by_k_and_b)
    if current_loss < min_loss:  # performance became better
        min_loss = current_loss
        best_k, best_b = current_k, current_b
        # no-op assignment: deliberately keeps the winning direction for the next step
        next_direction = next_direction
        update_time += 1
        if update_time % 10 == 0:
            print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))
    else:
        # the step made things worse: try a new random direction
        next_direction = random.choice(direction)
如果我們想得到更快的更新,在更短的時間內獲得更好的結果,我們需要一件事情:
找對改變的方向
如何找對改變的方向呢?
3rd-method: 監督讓他變化 --> 監督學習
導數
def partial_k(x, y, y_hat):
    """Partial derivative of the MSE loss w.r.t. k: -2/n * sum((y_i - y_hat_i) * x_i)."""
    n = len(y)
    total = sum((y_i - y_hat_i) * x_i
                for x_i, y_i, y_hat_i in zip(list(x), list(y), list(y_hat)))
    return -2 / n * total
def partial_b(x, y, y_hat):
    """Partial derivative of the MSE loss w.r.t. b: -2/n * sum(y_i - y_hat_i).

    `x` is unused; the parameter is kept for signature symmetry with partial_k.
    """
    n = len(y)
    total = sum(y_i - y_hat_i for y_i, y_hat_i in zip(list(y), list(y_hat)))
    return -2 / n * total
from icecream import ic
# Method 3: gradient descent — step k and b against their loss gradients
trying_times = 2000
X, y = data['data'], data['target']
min_loss = float('inf')
current_k = random.random() * 200 - 100
current_b = random.random() * 200 - 100
learning_rate = 1e-04
update_time = 0
for i in range(trying_times):
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    current_loss = loss(y, price_by_k_and_b)
    if current_loss < min_loss:  # performance became better
        min_loss = current_loss
        if i % 50 == 0:
            # fixed: the original printed best_k / best_b, which are not
            # defined in this cell (they leaked in from earlier experiments)
            print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, current_k, current_b, min_loss))
    k_gradient = partial_k(X_rm, y, price_by_k_and_b)
    b_gradient = partial_b(X_rm, y, price_by_k_and_b)
    # move against the gradient, scaled by the learning rate
    current_k = current_k + (-1 * k_gradient) * learning_rate
    current_b = current_b + (-1 * b_gradient) * learning_rate
# NOTE(review): this rebinds y from the target vector to the int 10 — later
# cells that use y as the target (including draw_rm_and_price) would break;
# confirm this was intentional notebook scratch work
y = 10
# plot the fit found by gradient descent (hard-coded result values)
X_rm = X[:, 5]
k = 11.431551629413757
b = -49.52403584539048
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
# sanity check of the loss: ((1)^2 + (1)^2 + (2)^2) / 3 = 2.0
loss([1, 1, 1], [2, 2, 3])
data['feature_names']