Intro to Computer Science

Tags (space-separated): Udacity


[TOC]

Lesson 2 Problem Set

Find Last

# Define a procedure, find_last, that takes as input
# two strings, a search string and a target string,
# and returns the last position in the search string
# where the target string appears, or -1 if there
# are no occurrences.
#
# Example: find_last('aaaa', 'a') returns 3

# Make sure your procedure has a return statement.

# Note on find() with a start argument: s.find(t, start) begins searching at index start;
# a negative start such as s.find(t, -2) starts from the second-to-last character of s.

def find_last(s, t):    # s means search, t means target
    last_pos = -1
    while True:
        pos = s.find(t, last_pos+1)   # the +1 is the key step: keep searching from just past the previous match
        if pos == -1:
            return last_pos
        last_pos = pos
  


print find_last('aaaa', 'a')
#>>> 3

print find_last('aaaaa', 'aa')
#>>> 3

print find_last('aaaa', 'b')
#>>> -1

#print find_last("111111111", "1")
#>>> 8

#print find_last("222222222", "")
#>>> 9

#print find_last("", "3")
#>>> -1

#print find_last("", "")
#>>> 0
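
For reference, Python's built-in str.rfind already searches from the right and returns -1 when the target is missing, so it gives the same answers for the tests above:

print 'aaaa'.rfind('a')
#>>> 3
print 'aaaaa'.rfind('aa')
#>>> 3
print 'aaaa'.rfind('b')
#>>> -1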

Lesson 2.5: How to Solve Problems

How Old

# By Websten from forums
#
# Given your birthday and the current date, calculate your age in days. 
# Account for leap days. 
#
# Assume that the birthday and current date are correct dates (and no 
# time travel). 

# My attempt below has too many holes and is wrong
def daysBetweenDates(year1, month1, day1, year2, month2, day2):
    ##
    # Your code here.
    ##
    year = year2 - year1
    if month2 < month1:
        month = month2 + 12 - month1
        year -= 1
    else:
        month = month2 - month1
    if day2 < day1:
        day = day2+30 - day1
        month -= 1
    else:
        day = day2 - day1
    return year*365 + month*30 + day + 1

# Test routine

def test():
    test_cases = [((2012,1,1,2012,2,28), 58), 
                  ((2012,1,1,2012,3,1), 60),
                  ((2011,6,30,2012,6,30), 366),
                  ((2011,1,1,2012,8,8), 585 ),
                  ((1900,1,1,1999,12,31), 36523)]
    for (args, answer) in test_cases:
        result = daysBetweenDates(*args)
        if result != answer:
            print "Test with data:", args, "failed"
        else:
            print "Test case passed!"

test()

First, write a simpler example, nextDay, assuming every month has only 30 days:

def nextDay(year, month, day):
    if day < 30:
        return year, month, day+1
    else:
        if month < 12:
            return year, month+1, 1
        else:
            return year+1, 1, 1
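
One way to use nextDay is to step from the first date to the second one day at a time and count the steps. The sketch below does that, still under the same simplified 30-day-month assumption; dateIsBefore is a hypothetical helper added here for the loop test, not part of the notes above.

def dateIsBefore(year1, month1, day1, year2, month2, day2):
    # True if the first date comes strictly before the second
    if year1 < year2:
        return True
    if year1 == year2:
        if month1 < month2:
            return True
        if month1 == month2:
            return day1 < day2
    return False

def daysBetweenDates(year1, month1, day1, year2, month2, day2):
    days = 0
    while dateIsBefore(year1, month1, day1, year2, month2, day2):
        year1, month1, day1 = nextDay(year1, month1, day1)
        days = days + 1
    return days

print daysBetweenDates(2012, 1, 1, 2012, 2, 28)
#>>> 57 with 30-day months (the real calendar answer is 58)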

Union of two lists

My incorrect attempt:

def union(a, b):
    for e in b:
        if e in a:
            pass
        else:
            a + e
            
# To test, uncomment all lines 
# below except those beginning with >>>.

a = [1,2,3]
b = [2,4,6]
union(a,b)
print a 
#>>> [1,2,3,4,6]
#print b
#>>> [2,4,6]

The error says a list can only be concatenated with another list, so + cannot be used with a single element here (and a + e would not modify a in place anyway).

The correct way is to use append():

def union(a, b):
    for e in b:
        if e not in a: 
            a.append(e)
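
With this version the earlier test behaves as expected:

a = [1,2,3]
b = [2,4,6]
union(a,b)
print a
#>>> [1, 2, 3, 4, 6]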

Lesson 3 Problem Set

Max Pages Solution

The unbounded version of the crawler:

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            union(tocrawl, get_all_links(get_page(page))) 
            crawled.append(page)
    return crawled

The version adjusted to respect max_pages:

def crawl_web(seed, max_pages):
    tocrawl = [seed]
    crawled = []   # len(crawled) is the number of pages crawled so far
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled and len(crawled) < max_pages:   # only crawl while fewer than max_pages pages have been crawled
            union(tocrawl, get_all_links(get_page(page))) 
            crawled.append(page)
    return crawled

Max Depth Solution

The unbounded version of the crawler again:

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            union(tocrawl, get_all_links(get_page(page))) 
            crawled.append(page)
    return crawled

The version adjusted for max_depth:

def crawl_web(seed,max_depth):    
    tocrawl = [seed]
    crawled = []
    next_depth = []   # holds the pages to crawl at the next depth
    depth = 0         # start at depth 0
    while tocrawl and depth <= max_depth:   # stop when nothing is left to crawl, or once depth exceeds max_depth
        page = tocrawl.pop()
        if page not in crawled:
            union(next_depth, get_all_links(get_page(page)))   # collect new links into next_depth instead of tocrawl
            crawled.append(page)
        if not tocrawl:   # key step: runs only after the current depth has been fully crawled
            tocrawl, next_depth = next_depth, []   # move the next depth's pages into tocrawl and reset next_depth
            depth = depth + 1   # tocrawl now holds only next-depth pages, so increment depth
    return crawled

The version without comments:

def crawl_web(seed,max_depth):    
    tocrawl = [seed]
    crawled = []
    next_depth = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            union(next_depth, get_all_links(get_page(page)))
            crawled.append(page)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled

Problem Set: Sudoku

Problem description:

# A valid sudoku square satisfies these
# two properties:

#   1. Each column of the square contains
#       each of the whole numbers from 1 to n exactly once.

#   2. Each row of the square contains each
#       of the whole numbers from 1 to n exactly once.

# You may assume that the input is square and contains at
# least one row and column.

My answer:

def twice(p, m):    # check whether m appears two or more times in p
    n = 0
    for i in p:
        if i == m:
            n = n + 1
    if n > 1:
        return True
    else:
        return False


def check_sudoku(sudoku):
    for i in sudoku:    # check that the grid is square (n rows of n entries)
        if len(i) != len(sudoku):
            return False
    for n in sudoku:     # check that every entry is a whole number from 1 to 9 (no letters, no fractions)
        for m in n:
            if m not in [1,2,3,4,5,6,7,8,9]:
                return False
    for n in sudoku:    # check rows and columns for repeated digits
        for m in n:     # check whether the row contains m more than once
            if twice(n, m):
                return False
            column = n.index(m)   # column index of m
            # the next few lines collect the column containing m into m_column
            m_column = []
            i = 0
            while i < len(sudoku):
                m_column.append(sudoku[i][column])
                i = i + 1
            # check whether the column contains m more than once
            if twice(m_column, m):
                return False
    return True

The solution
The idea: instead of reading each entry and validating it, take each digit from 1 to n and count how many times it appears in every row and every column.

def check_sudoku(p):
    n = len(p) # Extract size of grid
    digit = 1 # start with 1 and compare each digit in turn
    while digit <= n: #Go through each digit
        i = 0
        while i < n: #Go through each row and column        
            row_count = 0
            col_count = 0
            j = 0
            while j < n: #for each entry in ith row/column
                if p[i][j] == digit: #check row count
                    row_count = row_count + 1
                if p[j][i] == digit:
                    col_count = col_count + 1
                j = j + 1
            if row_count != 1 or col_count != 1:
                return False
            i = i + 1 #next row/column
        digit = digit + 1 #next digit
    return True #Nothing was wrong!
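
A couple of quick checks against this solution (the small grids below are made up here, not from the course):

valid = [[1,2,3],
         [2,3,1],
         [3,1,2]]

invalid = [[1,2,3],
           [2,3,1],
           [2,3,1]]   # the repeated row makes the columns fail

print check_sudoku(valid)
#>>> True
print check_sudoku(invalid)
#>>> False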

Symmetric Square

The zip() function can transpose a matrix: zip(*q) turns rows into columns.

q = [[0, 1, 2], 
     [-1, 0, 3], 
     [-2, -3, 0]]

print zip(*q)

[(0, -1, -2), 
 (1, 0, -3), 
 (2, 3, 0)]

mutated = [list(i) for i in(zip(*q))]
print mutated

[[0, -1, -2], 
 [1, 0, -3], 
 [2, 3, 0]]

list() converts each tuple () into a list [], so the transposed rows can be compared with the original nested lists.
The full function:

def symmetric(q):
    original = q
    mutated = [list(i) for i in(zip(*q))]
    if original==mutated:
        return True
    return False
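
Quick checks:

print symmetric(q)   # the q above is antisymmetric, not symmetric
#>>> False
print symmetric([[1, 2],
                 [2, 1]])
#>>> True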

Lesson 4 Responding to Queries

Add to Index

The requirements (also written up in my notes):

# Define a procedure, add_to_index,
# that takes 3 inputs:

# - an index: [[<keyword>,[<url>,...]],...]
# - a keyword: String
# - a url: String

# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.

# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]

index = []

def add_to_index(index_0, keyword, url):
    # Your code here (quiz skeleton; the answer is below)
    pass

add_to_index(index,'udacity','http://udacity.com')
add_to_index(index,'computing','http://acm.org')
add_to_index(index,'udacity','http://npr.org')
print index
#>>> [['udacity', ['http://udacity.com', 'http://npr.org']], 
#>>> ['computing', ['http://acm.org']]]

My code:

def check_keyword(index, keyword):   # check whether keyword is already in the index
    for i in index:
        if i[0] == keyword:
            return True
    return False

# return the top-level position of keyword within the index
def return_keyword_index(index, keyword):
    for i in index:
        if i[0] == keyword:
            return index.index(i)


def add_to_index(index, keyword, url):
    if not check_keyword(index, keyword):
        index.append([keyword, [url]])
    else:
        i = return_keyword_index(index, keyword)
        index[i][1].append(url)

The answer

def add_to_index(index, keyword, url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword, [url]])

lookup

Return all the urls for a keyword; if the keyword is not in the index, return an empty list.

# Define a procedure, lookup,
# that takes two inputs:

# - an index
# - keyword

# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.

index = [['udacity', ['http://udacity.com', 'http://npr.org']],
         ['computing', ['http://acm.org']]]
#my code
def lookup(index,keyword):
    for i in index:
        if i[0] == keyword:
            return i[1]
    return []
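
Quick checks with the index above ('physics' is just any keyword that is not in the index):

print lookup(index, 'udacity')
#>>> ['http://udacity.com', 'http://npr.org']
print lookup(index, 'physics')
#>>> []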

Add Page to Index Solution

Split the content (for example, the text of a page) into individual words, use each word as a keyword, and add the url to that keyword's url list.

# Define a procedure, add_page_to_index,
# that takes three inputs:

#   - index
#   - url (String)
#   - content (String)

# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.

index = []


def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])

# the procedure to define
def add_page_to_index(index,url,content):
    words = content.split()
    for word in words:
        add_to_index(index,word,url)

Test result:

add_page_to_index(index,'fake.text',"This is a test")
print index

[['This', ['fake.text']], ['is', ['fake.text']], ['a', ['fake.text']], ['test', ['fake.text']]]

Finish the web crawler

Modify the earlier crawl_web so that it builds the index while crawling and returns the finished index.

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = [] #update
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page) #update, the page is the url
            add_page_to_index(index,page,content) # update
            union(tocrawl,get_all_links(content))
            crawled.append(page)
    return index

Lesson 3: The Internet

The code for get_page():

def get_page(url):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except:
        return ""

Lesson 4 Problem Set

Better Splitting

# The built-in <string>.split() procedure works
# okay, but fails to find all the words on a page
# because it only uses whitespace to split the
# string. To do better, we should also use punctuation
# marks to split the page into words.

# Define a procedure, split_string, that takes two
# inputs: the string to split and a string containing
# all of the characters considered separators. The
# procedure should return a list of strings that break
# the source string up by the characters in the
# splitlist.


def split_string(source, splitlist):
    output = []
    atsplit = True            # True means the next real character starts a new word
    for char in source:
        if char in splitlist:
            atsplit = True    # hit a separator: the next word starts after it
        else:
            if atsplit:
                output.append(char)        # start a new word
                atsplit = False
            else:
                output[-1] = output[-1] + char   # extend the current word
    return output
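
A quick check (this particular test string is made up here):

out = split_string("This is a test-of the,string separation-code!", " ,!-")
print out
#>>> ['This', 'is', 'a', 'test', 'of', 'the', 'string', 'separation', 'code']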

Improving the Index Solution

# The current index includes a url in the list of urls
# for a keyword multiple times if the keyword appears
# on that page more than once.

# It might be better to only include the same url
# once in the url list for a keyword, even if it appears
# many times.

# Modify add_to_index so that a given url is only
# included once in the url list for a keyword,
# no matter how many times that keyword appears.

def add_to_index(index, keyword, url):
    for entry in index:
        if entry[0] == keyword:
            if url not in entry[1]: # the only line this problem needs to add
                entry[1].append(url)
            return
    # not found, add new keyword to index
    index.append([keyword, [url]])
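
With the extra check, adding the same url twice for a keyword no longer duplicates it:

index = []
add_to_index(index, 'udacity', 'http://udacity.com')
add_to_index(index, 'udacity', 'http://udacity.com')
print index
#>>> [['udacity', ['http://udacity.com']]]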


def get_page(url):
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a >
learn to crawl</a> before you try to
<a >walk</a> or
<a >fly</a>.</p></body>
</html>'''

        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return '''<html> <body> I have not learned to crawl yet, but I am
quite good at  <a >kicking</a>.
</body> </html>'''

        elif url == "http://www.udacity.com/cs101x/walking.html":
            return '''<html> <body> I cant get enough
<a 

        elif url == "http://www.udacity.com/cs101x/flying.html":
            return '''<html>
<body>The magic words are Squeamish Ossifrage!</body></html>'''
    except:
        return ""
    return ""

def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            union(tocrawl, get_all_links(content))
            crawled.append(page)
    return index

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def lookup(index, keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return None

#index = crawl_web("http://www.udacity.com/cs101x/index.html")
#print lookup(index,"is")
#>>> ['http://www.udacity.com/cs101x/index.html']

Counting Clicks Solution

Record how many times each url is clicked for a search.
Each call to lookup() is one search for a keyword; when the user clicks one of the returned urls, record_user_click(index, keyword, url) adds 1 to that url's count. This requires changing the data structure so that every url carries a count: an index entry becomes [keyword, [[url, count], [url, count], ...]].

#urls = [[url, count],[url,count]...]

def record_user_click(index, keyword, url):
    urls = lookup(index, keyword)
    if urls:
        for entry in urls:
            if entry[0] == url:
                entry[1] = entry[1]+1

def add_to_index(index, keyword, url):
    # format of index: [[keyword, [[url, count], [url, count],..]],...]
    for entry in index:
        if entry[0] == keyword:
            for urls in entry[1]:
                if urls[0] == url:
                    return
            entry[1].append([url,0])
            return
    # not found, add new keyword to index
    index.append([keyword, [[url,0]]])
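
A quick check, assuming the list-based lookup procedure defined earlier in these notes is in scope:

index = []
add_to_index(index, 'udacity', 'http://udacity.com')
record_user_click(index, 'udacity', 'http://udacity.com')
print index
#>>> [['udacity', [['http://udacity.com', 1]]]]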

time_execution

import time #this is a Python library

def time_execution(code):
   start = time.clock()  # start the clock
   result = eval(code)  # evaluate any string as if it is a Python command
   run_time = time.clock() - start  # find difference in start and end time
   return result, run_time  # return the result of the code and time taken

The code passed to eval() must be a string, for example: time_execution('add_to_index(1,2,3)').

Lesson 6

6.8 Implementing URank

After learning about PageRank, modify the earlier crawl_web() code so that it also returns the graph structure.
The earlier crawl_web():

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            union(tocrawl, get_all_links(content))
            crawled.append(page)
    return index

The modified version:

def crawl_web(seed): # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>:[list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)   # all the links found in the content
            
            #Insert Code Here
            graph[page] = outlinks
            
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph

The complete code:

# Modify the crawl_web procedure so that instead of just returning the 
# index, it returns an index and a graph. The graph should be a 
# Dictionary where the key:value entries are:

#  url: [list of pages url links to] 


def crawl_web(seed): # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>:[list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            
            #Insert Code Here
            if page not in graph:
                graph[page] = outlinks
            
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph


cache = {
   'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipes:
<ul>
<li> <a >Hummus Recipe</a>
<li> <a s Best Hummus</a>
<li> <a s Hummus Recipe</a>
</ul>

For more expert opinions, check out the 
<a >Nickel Chef</a> 
and <a >Zinc Chef</a>.
</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from 
<a >the Nickel Chef</a>.
</p>
<p>
For great hummus, try 
<a >this recipe</a>.

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a >
best Hummus recipe!
</a>

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>

<ol>
<li> Open a can of garbanzo beans.
<li> Crush them in a blender.
<li> Add 3 tablespoons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>

<ol>
<li> Kidnap the <a >Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>

<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>

</body>
</html>




""", 
}

def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None
    
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links


def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
        
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None



index , graph = crawl_web('http://udacity.com/cs101x/urank/index.html') 

if 'http://udacity.com/cs101x/urank/index.html' in graph:
    print graph['http://udacity.com/cs101x/urank/index.html']
#>>> ['http://udacity.com/cs101x/urank/hummus.html',
#'http://udacity.com/cs101x/urank/arsenic.html',
#'http://udacity.com/cs101x/urank/kathleen.html',
#'http://udacity.com/cs101x/urank/nickel.html',
#'http://udacity.com/cs101x/urank/zinc.html']

6.8.5 The complete PageRank code

#Finishing the page ranking algorithm.

def compute_ranks(graph):
    d = 0.8 # damping factor
    numloops = 10
    
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            
            #Insert Code Here
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))

            newranks[page] = newrank
        ranks = newranks
    return ranks
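
In words: every page starts with rank 1.0/npages, and on each of the numloops passes a page's new rank is (1 - d)/npages plus d times the sum, over every node that links to the page, of ranks[node] divided by the number of that node's outlinks.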



cache = {
   'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a >Hummus Recipe</a>
<li> <a s Best Hummus</a>
<li> <a s Hummus Recipe</a>
</ul>

For more expert opinions, check out the 
<a >Nickel Chef</a> 
and <a >Zinc Chef</a>.
</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from 
<a >the Nickel Chef</a>.
</p>
<p>
For great hummus, try 
<a >this recipe</a>.

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a >
best Hummus recipe!
</a>

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>

<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>

<ol>
<li> Kidnap the <a >Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>

<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>

</body>
</html>




""", 
}

def crawl_web(seed): # returns index, graph of inlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            
            
            graph[page] = outlinks
            
            
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph


def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None
    
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links


def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
        
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)
print ranks

#>>> {'http://udacity.com/cs101x/urank/kathleen.html': 0.11661866666666663,
#'http://udacity.com/cs101x/urank/zinc.html': 0.038666666666666655,
#'http://udacity.com/cs101x/urank/hummus.html': 0.038666666666666655,
#'http://udacity.com/cs101x/urank/arsenic.html': 0.054133333333333325,
#'http://udacity.com/cs101x/urank/index.html': 0.033333333333333326,
#'http://udacity.com/cs101x/urank/nickel.html': 0.09743999999999997}



