阿里電話面試問題----100萬個URL如何找到出現頻率最高的前100個？

阿新 • • 發佈：2019-01-26

內推阿里電話面試中面試官給我出的一個題：

我想的頭一個解決方案，就是把URL放到STL的map裡面統計出現次數，再以出現的頻率（pair的第二個欄位）進行排序，之後按照排序結果返回前100個：

下面口說無憑，show your code，當然在討論帖子中遭遇了工程界大牛的sql程式碼在技術上的碾壓。什麼是做工程的，什麼是工程師的思維，不要一味的埋頭搞演算法。

討論帖：

python 抓取百度搜索結果的討論貼：

實驗資料，python從百度抓得：

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""


import urllib2 
import re 
import os

# Connect to a URL.
# One page of search results contains roughly 200 URLs.
file_url = open('url.txt','ab+')
# Text typed into the search box; it could be set to a number so that each search returns different results.
search = '123'
url = "http://www.baidu.com/s?wd="+search


def setUrlToFile():
    website = urllib2.urlopen(url) 
    #read html code 

    html = website.read() 

    #use re.findall to get all the links 

    links = re.findall('"((http|ftp)s?://.*?)"', html)
 

    for s in links:
        print s[0]
        if len(s[0]) < 256:
            file_url.write(s[0]+'\r\n')
    
# Collect the experimental data: run the fetch-and-append step 50 times.
for i in range(0,50):
    setUrlToFile()

file_url.close()


# The file has to be reopened before it can be read back.
file_url = open('url.txt','r')
file_lines = len(file_url.readlines())
# NOTE(review): the %s argument is the file object itself, not its name; confirm that is intended.
print "there are %d url in %s" %(file_lines,file_url)
file_url.close()

方法1：

c++ 寫的讀 url.txt放到map裡面

對map<string , int>的value進行排序，得到前100個

執行一下也就55s，還是很快的，url長度進行了限制小於256個字元

#pragma once
/*
//計算程式碼段執行時間的類
//
*/
#include <iostream>

#ifndef ComputeTime_h
#define ComputeTime_h


//單位毫秒

// Stopwatch for timing a section of code; End() reports milliseconds.
class   ComputeTime    
{  
private:  
	// Result of the frequency query made in the constructor; zero means timing is unavailable.
	int Initialized;  
	// Performance-counter frequency obtained in the constructor.
	__int64 Frequency;  
	// Performance-counter value captured by Begin().
	__int64 BeginTime;  
		    
public:  

	// Returns whether the frequency query in the constructor succeeded.
	bool Avaliable();  
	// Returns the time elapsed since Begin() in milliseconds, or 0 when timing is unavailable.
	double End();  
	// Captures the start counter value; returns false when timing is unavailable.
	bool Begin();  
	// Queries the performance-counter frequency.
	ComputeTime();  
	virtual   ~ComputeTime();    

};  






#endif
#include "stdafx.h"
#include "ComputeTime.h"
#include <iostream>
#include <Windows.h>

// Queries the performance-counter frequency once and remembers whether the
// query succeeded (non-zero Initialized).
//
// Frequency and BeginTime are zero-initialised first so that no member is ever
// read while indeterminate, e.g. when End() is called before Begin() or when
// the frequency query fails.
ComputeTime::ComputeTime()
	: Initialized(0), Frequency(0), BeginTime(0)
{
	Initialized = QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(&Frequency));
}
   
 ComputeTime::~ComputeTime()
{
	// The timer holds only plain integer members, so there is nothing to release.
}
   
 bool   ComputeTime::Begin()  
{  
	if(!Initialized)  
		return 0;

	 return   QueryPerformanceCounter((LARGE_INTEGER   *)&BeginTime);  
 }
     
 double   ComputeTime::End()
{  
	 if(!Initialized)  
		return 0;

		   
	 __int64   endtime;  
		   
	 QueryPerformanceCounter((LARGE_INTEGER   *)&endtime);  
		    
		  
	 __int64   elapsed = endtime-BeginTime;  
		    
		  
	 return   ((double)elapsed/(double)Frequency)*1000.0;  //單位毫秒
 }  

 bool ComputeTime::Avaliable()
{
	// True when the frequency query made in the constructor reported success.
	return Initialized != 0;
}


// sortUrl.cpp : 定義控制檯應用程式的入口點。
//

#include "stdafx.h"
//#include <utility>    
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include "ComputeTime.h"

using namespace std;

// Occurrence count for each distinct URL line read from the input file.
map<string,int> urlfrequency;


// (url, count) entry type used when ranking the map's contents.
typedef pair<string, int> PAIR;


// Comparator that orders (url, count) entries by descending count.
//
// The parameter type is the same type the file's PAIR typedef names.
// operator() is const so the comparator can also be invoked through a const
// object, which some standard containers and adaptors do.
struct CmpByValue
{
	bool operator()(const std::pair<std::string, int>& lhs,
	                const std::pair<std::string, int>& rhs) const
	{
		return lhs.second > rhs.second;
	}
};

// Returns true when lhs occurred more often than rhs (descending-count order).
static bool moreFrequentUrl(const std::pair<std::string, int>& lhs,
                            const std::pair<std::string, int>& rhs)
{
	return lhs.second > rhs.second;
}

// Prints the most frequent URLs, highest count first: the URL on one line and
// its count on the next.
//
// urlfrequency  url -> occurrence count; taken by const reference so the whole
//               map is not copied on every call.
// top_n         maximum number of entries to print (default 100).
//
// At most min(top_n, number of distinct URLs) entries are printed, so a map
// with fewer than top_n URLs no longer indexes past the end of the vector.
// The relative order of URLs with equal counts is unspecified.
void find_largeTH(const std::map<std::string, int>& urlfrequency, std::size_t top_n = 100)
{
	typedef std::pair<std::string, int> Entry;

	// Copy the map entries into a vector so they can be ordered by value.
	std::vector<Entry> ranked(urlfrequency.begin(), urlfrequency.end());
	const std::size_t shown = ranked.size() < top_n ? ranked.size() : top_n;

	// Only the first `shown` positions have to be in order.
	std::partial_sort(ranked.begin(),
	                  ranked.begin() + static_cast<std::vector<Entry>::difference_type>(shown),
	                  ranked.end(),
	                  moreFrequentUrl);

	for (std::size_t i = 0; i < shown; ++i)
	{
		std::cout << ranked[i].first << '\n';
		std::cout << ranked[i].second << '\n';
	}
	std::cout.flush();
}


//urlheap的建立過程，URL插入時候存在的
void insertUrl(string url)
{
	pair<map<string ,int>::iterator, bool> Insert_Pair;
	Insert_Pair = urlfrequency.insert(map<string, int>::value_type(url,1));



	if (Insert_Pair.second == false)
	{
		(Insert_Pair.first->second++);
	}
	

}


// Entry point: counts every URL line of url.txt, prints the most frequent
// ones and reports the elapsed time in milliseconds.
//
// Returns 1 when url.txt cannot be opened, 0 otherwise.
int _tmain(int argc, _TCHAR* argv[])
{
	// The file is only read, so it is opened read-only; binary mode keeps the
	// bytes untouched and the CR of a CRLF line ending is stripped below.
	std::ifstream URLfile("url.txt", std::ios::in | std::ios::binary);

	if (!URLfile.is_open())
	{
		std::cout << "Error opening file";
		return 1;
	}
	std::cout << "open file success!" << std::endl;

	ComputeTime cp;
	cp.Begin();

	// Looping on the result of getline (instead of testing eof() first) stops
	// exactly at end of input, and reading into a std::string removes the
	// fixed-size buffer whose overflow would leave the stream in a failed
	// state without ever reaching eof.
	std::string line;
	while (std::getline(URLfile, line))
	{
		if (!line.empty() && line[line.size() - 1] == '\r')
		{
			line.erase(line.size() - 1);
		}
		if (line.empty())
		{
			continue;	// a blank line is not a URL
		}
		insertUrl(line);
	}

	find_largeTH(urlfrequency);

	std::cout << "running time: " << cp.End() << "ms" << std::endl;

	getchar();
	//system("pause");
	return 0;
}

實驗結果：55s還不算太差，可以接受，畢竟是頭腦中的第一個解決方案。

方法2：

hash code 版本，只是不知道怎麼 hash和url關聯起來：

// urlFind.cpp : 定義控制檯應用程式的入口點。
//

// sortUrl.cpp : 定義控制檯應用程式的入口點。
//

#include "stdafx.h"
 
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include <unordered_map>
#include "ComputeTime.h"

using namespace std;

// Occurrence count keyed by the BKDR hash of each URL line; the URL text itself is not stored.
map<unsigned int,int> urlhash;


// (hash, count) entry type used when ranking the map's contents.
typedef pair<unsigned int, int> PAIR;


// A URL together with its occurrence count.
// The ordering is inverted on purpose: a record compares "less" when its
// count is larger.
struct info
{
	std::string url;
	int cnt;

	bool operator<(const info& other) const
	{
		return other.cnt < cnt;
	}
};


// NOTE(review): not referenced anywhere in the visible code of this program.
unordered_map<string,int> count;

//priority_queue<info> pq;


// Comparator that orders (hash, count) entries by descending count.
//
// The parameter type is the same type the file's PAIR typedef names.
// operator() is const so the comparator can also be invoked through a const
// object, which some standard containers and adaptors do.
struct CmpByValue
{
	bool operator()(const std::pair<unsigned int, int>& lhs,
	                const std::pair<unsigned int, int>& rhs) const
	{
		return lhs.second > rhs.second;
	}
};

// Returns true when lhs occurred more often than rhs (descending-count order).
static bool moreFrequentHash(const std::pair<unsigned int, int>& lhs,
                             const std::pair<unsigned int, int>& rhs)
{
	return lhs.second > rhs.second;
}

// Prints the most frequent URL hashes, highest count first: the hash on one
// line and its count on the next.
//
// urlhash  hash -> occurrence count; taken by const reference so the whole map
//          is not copied on every call.
// top_n    maximum number of entries to print (default 100).
//
// At most min(top_n, number of distinct hashes) entries are printed, so a map
// with fewer than top_n entries no longer indexes past the end of the vector.
// The relative order of entries with equal counts is unspecified.
void find_largeTH(const std::map<unsigned int, int>& urlhash, std::size_t top_n = 100)
{
	typedef std::pair<unsigned int, int> Entry;

	// Copy the map entries into a vector so they can be ordered by value.
	std::vector<Entry> ranked(urlhash.begin(), urlhash.end());
	const std::size_t shown = ranked.size() < top_n ? ranked.size() : top_n;

	// Only the first `shown` positions have to be in order.
	std::partial_sort(ranked.begin(),
	                  ranked.begin() + static_cast<std::vector<Entry>::difference_type>(shown),
	                  ranked.end(),
	                  moreFrequentHash);

	for (std::size_t i = 0; i < shown; ++i)
	{
		std::cout << ranked[i].first << '\n';
		std::cout << ranked[i].second << '\n';
	}
	std::cout.flush();
}


// BKDR hash of a NUL-terminated string.
//
// str  the string to hash; it is only read, so it is taken as const char* and
//      callers holding a const string need no cast. A null pointer hashes to 0.
//
// Returns the hash with the top bit cleared, i.e. a value in [0, 0x7FFFFFFF].
unsigned int BKDRHash(const char *str)
{
	const unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
	unsigned int acc = 0;

	if (!str)
	{
		return 0;
	}

	for (; *str; ++str)
	{
		acc = acc * seed + (*str);
	}

	return (acc & 0x7FFFFFFF);
}

//
void insertUrl(string url)
{

	unsigned int hashvalue = BKDRHash((char *)url.c_str());
	pair<map<unsigned int ,int>::iterator, bool> Insert_Pair;
	Insert_Pair = urlhash.insert(map<unsigned int, int>::value_type(hashvalue,1));

	if (Insert_Pair.second == false)
	{
		(Insert_Pair.first->second++);
	}


}


// Entry point: counts every URL line of url.txt by hash, prints the most
// frequent hashes and reports the elapsed time in milliseconds.
//
// Returns 1 when url.txt cannot be opened, 0 otherwise.
int _tmain(int argc, _TCHAR* argv[])
{
	// The file is only read, so it is opened read-only; binary mode keeps the
	// bytes untouched and the CR of a CRLF line ending is stripped below.
	std::ifstream URLfile("url.txt", std::ios::in | std::ios::binary);

	if (!URLfile.is_open())
	{
		std::cout << "Error opening file";
		return 1;
	}
	std::cout << "open file success!" << std::endl;

	ComputeTime cp;
	cp.Begin();

	// Looping on the result of getline (instead of testing eof() first) stops
	// exactly at end of input, and reading into a std::string removes the
	// fixed-size buffer whose overflow would leave the stream in a failed
	// state without ever reaching eof.
	std::string line;
	while (std::getline(URLfile, line))
	{
		if (!line.empty() && line[line.size() - 1] == '\r')
		{
			line.erase(line.size() - 1);
		}
		if (line.empty())
		{
			continue;	// a blank line is not a URL
		}
		insertUrl(line);
	}

	find_largeTH(urlhash);

	std::cout << "running time: " << cp.End() << "ms" << std::endl;

	getchar();
	//system("pause");
	return 0;
}

效能15秒左右：缺點在於沒有把hashcode和url進行關聯，技術的處理速度已經非常可觀了

方法3：

下面用STL的hash容器unordered_map，和優先佇列(就是堆)來實現這個問題。

// urlFind.cpp : 定義控制檯應用程式的入口點。
//

// sortUrl.cpp : 定義控制檯應用程式的入口點。
//

#include "stdafx.h"
 
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include <unordered_map>
#include <queue>
#include "ComputeTime.h"

using namespace std;


// NOTE(review): PAIR is not referenced in the visible code of this program.
typedef pair<string, int> PAIR;


// A URL together with its occurrence count.
// Records compare by count, so a priority_queue<info> keeps the record with
// the largest count on top.
struct info
{
	std::string url;
	int cnt;

	bool operator<(const info& other) const
	{
		return other.cnt > cnt;
	}
};


// Occurrence count for each distinct URL line.
unordered_map<string,int> hash_url;

// Heap of (url, count) records ordered by info::operator<; filled by find_largeTH.
priority_queue<info> pq;



void find_largeTH(unordered_map<string,int> urlhash)
{

	unordered_map<string,int>::iterator iter = urlhash.begin();
	info temp;
	for (; iter!= urlhash.end();++iter)
	{
		temp.url = iter->first;
		temp.cnt = iter->second;
		pq.push(temp);
	}

	for (int i = 0; i != 100; ++i) 
	{

		cout<<pq.top().url<<endl;
		cout<<pq.top().cnt<<endl;
		pq.pop();
	}
}



void insertUrl(string url)
{

	pair<unordered_map<string ,int>::iterator, bool> Insert_Pair;
	Insert_Pair = hash_url.insert(unordered_map<string, int>::value_type(url,1));

	if (Insert_Pair.second == false)
	{
		(Insert_Pair.first->second++);
	}

}

// Entry point: counts every URL line of url.txt, prints the most frequent
// ones and reports the elapsed time in milliseconds.
//
// Returns 1 when url.txt cannot be opened, 0 otherwise.
int _tmain(int argc, _TCHAR* argv[])
{
	// The file is only read, so it is opened read-only; binary mode keeps the
	// bytes untouched and the CR of a CRLF line ending is stripped below.
	std::ifstream URLfile("url.txt", std::ios::in | std::ios::binary);

	if (!URLfile.is_open())
	{
		std::cout << "Error opening file";
		return 1;
	}
	std::cout << "open file success!" << std::endl;

	ComputeTime cp;
	cp.Begin();

	// Looping on the result of getline (instead of testing eof() first) stops
	// exactly at end of input, and reading into a std::string removes the
	// fixed-size buffer whose overflow would leave the stream in a failed
	// state without ever reaching eof.
	std::string line;
	while (std::getline(URLfile, line))
	{
		if (!line.empty() && line[line.size() - 1] == '\r')
		{
			line.erase(line.size() - 1);
		}
		if (line.empty())
		{
			continue;	// a blank line is not a URL
		}
		insertUrl(line);
	}

	find_largeTH(hash_url);

	std::cout << "running time: " << cp.End() << "ms" << std::endl;

	getchar();
	//system("pause");
	return 0;
}

基本上算是演算法裡面比較優秀的解決方案了，面試官如果能聽到這個方案應該會比較欣喜。

方法4：實驗耗時未知，技術上碾壓了上述解決方案，中高年輕人，不要重複造輪子！哈哈

資料庫，SQL語句：

-- Bulk-load the text file into the url column of tb_url.
load data infile "d:/bigdata.txt" into table tb_url(url);

-- Count the rows for each distinct url and keep the 100 most frequent.
SELECT
	url,
	count(url) as show_count
	FROM
	tb_url
	GROUP BY url
	ORDER BY show_count desc
	LIMIT 100

給定一個file，查找出裡面出現頻率最高的10個單詞

之前已經總結了給定一組數字，如何在線性時間內找到第k小的數字。這兩個問題看似有十分subtle的關係。很顯然這裡是找最大的前K個單詞。單詞相當於衛星資料，直接對單詞的鍵值，即頻率排序啦。現在我們對這個求top K frequent words做一個小小的總結

怎樣從10億查詢詞找出出現頻率最高的10個

1．問題描述在大規模資料處理中，常遇到的一類問題是，在海量資料中找出出現頻率最高的前K個數，或者從海量資料中找出最大的前K個數，這類問題通常稱為“top K”問題，如：在搜尋引擎中，統計搜尋最熱門的10個查詢詞；在歌曲庫中統計下載率最高的前10首歌等等。 2．當

leetcode_347. Top K Frequent Elements 找出現頻率最高的前k個元素

題目： Given a non-empty array of integers, return the k most frequent elements. For example, Given [1,1,1,2,2,3] and k = 2, return [1,2]

LeetCode347 出現頻率最高的K個元素

Given a non-empty array of integers, return the k most frequent elements. Example 1: Input: nums = [1,1,1,2,2,3], k = 2 Output: [1,2] Example

阿里電話面試問題----100萬個URL如何找到出現頻率最高的前100個？

內推阿里電話面試中面試官給我出的一個題：我想的頭一個解決方案，就是放到stl 的map裡面對出現的頻率作為pair的第二個欄位進行排序，之後按照排序結果返回：下面口說無憑，show your code，當然在討論帖子中遭遇了工程界大牛的sql程式碼在技術上的碾壓。什麼是

從100萬個數中找出最大的前100個數

1.演算法如下：根據快速排序劃分的思想 (1) 遞迴對所有資料分成[a,b）b（b,d]兩個區間，(b,d]區間內的數都是大於[a,b)區間內的數 (2) 對(b,d]重複(1)操作，直到最右邊的區間個數小於100個。注意[a,b)區間不用劃分 (3) 返回上

記一次阿里電話面試

晚上九點跟朋友在看電影，正看到一半的時候接到來自杭州阿里巴巴的固定電話，心裡一驚，呀，好突然的電話面試啊，然後在毫無準備的情況下開始了這次的面試。接通電話一個聽上去很和藹的聲音開始自我介紹然後問我是否方便進行面試，在聽說我正在看電影之後說再約一個時間進行面試，我想了一下，本想

一年Java的阿里電話面試全紀錄

在北京待了快兩年了，之前一直在一家大型國企實習家轉正，2017年畢業到現在也已經畢業快一年了，幾位一起在京的同學想回南方發展（本人也是南方人），所以自己也下了這個決心，月初遞交了辭呈。經過考慮之後選擇了杭州，既然選擇了杭州阿里巴巴自然是一座要抱著敬畏之心去征服的大山。首先非常

阿里電話面試總結

電話面試開始，叫我做了自我介紹，這個沒有準備，只簡單的說了幾句，接著就問我技術方面的問題了，總結如下： 1.兩個演算法問題，第一個是有介於1-101的101個隨機數，裡面有兩個相同的數，怎麼找到這個數。我首先說先用排序演算法進行排序，然後進行比較。面試官又問了此演算

二叉搜尋樹的最小節點絕對值之差/在二叉查詢樹中尋找兩個節點，使它們的和為一個給定值/找出 BST 中的所有眾數（出現頻率最高的元素）。

關於二叉樹的數值運算，一般考慮借用中序遍歷為陣列；再進行計算的思想。 /** * Definition for a binary tree node. * public class TreeNode { * int val; * TreeNode left; *

統計一段文章的單詞頻率，取出頻率最高的5個單詞和個數(python)

練習題：統計一段英語文章的單詞頻率，取出頻率最高的5個單詞和個數(用python實現)怎麼判定單詞?1 不是字母的特殊字元作為分隔符分割字串（避免特殊字元的處理不便，全部替換成'-')2 遍歷字串，取每個word3 正則匹配怎麼統計個數？將wordlist的word和word的個數放入dict，排序

Python 中找出字串中出現頻率最高的字母

發現一個學Python的好網站 https://py.checkio.org 第一題大概意思就是找出一個字串中出現頻率最高字母我的思路也是直接，弄個字典，遍歷字串，將鍵值對填進字典裡，健就是字母，值就是出現了幾次，再查下字典裡最大的值即可。上我的程式碼 import

LeetCode:347. Top K Frequent Elements(找出出現頻率最高的K個數)

Given a non-empty array of integers, return the k most frequent elements. Example 1: Input: nums = [1,1,1,2,2,3], k = 2 Output: [1,2]

面試題：找出無序陣列中出現頻率最高的元素

解決這道題的思路有很多比如： 1.給陣列排序變成有序陣列，然後找到重複次數最多的元素； 2.用HashMap儲存陣列元素，優先佇列存取陣列元素出現的次數，找出現次數最多的元素輸出； 3.記錄元素出現的次數及對應的值，迴圈不斷更新最大次數和對應的值，最後儲存

在一個字串中個找到只出現一次的第一個字元

1. 看到題目，我首先想到的第一種方法是：簡單粗暴的搜尋，從頭到尾遍歷每個字元，看它的前後是不是有與之相等的字元（不包括本身），如果有那麼就繼續下一個字元，如果沒有就輸出該字元（這裡需要注意的是，有的人寫出來的演算法，沒有考慮到最後一個字元

通過命令查詢檔案中出現頻率最高的前K個詞

　使用linux命令或者shell實現：檔案words存放英文單詞，格式為每行一個英文單詞（單詞可以重複），統計這個檔案中出現次數最多的前10個單詞。 cat words.txt | sort | uniq -c | sort -k1,1nr | hea

統計一TXT文件中單詞出現頻率，輸出頻率最高的10個單詞

實驗過程主要思路就是首先將標點符號，常用冠詞等替換掉，然後利用雜湊表和陣列原理排序，輸出最高頻率的前十個陣列程式碼如下 import java.io.BufferedReader; import java.io.File; import java.io.Fil

13、出現頻率最高的前K個元素

（個人水平有限，請見諒！）描述：有一個不為空且僅包含正整數的陣列，找出其中出現頻率最高的前 K 個數，時間複雜度必須在 O(n log n) 以內。輸入：一行資料包括兩部分，一個正整數陣列（數字間 ‘,’ 分隔）和一個正整數 K （1 ≤ K

算法試題 - 找出一個序列中出現頻率最高的三個數

ons .... span import class for 方法 dict let 題目找出一個序列中出現頻率最高的三個數解析思路一創建一個新字典, k 為序列的值, 然後 v 的初始值 0, 然後循環序列進行計數, 然後進行新字典的處理.....

面試- 阿里-. 大資料題目- 給定a、b兩個檔案，各存放50億個url，每個url各佔64位元組，記憶體限制是4G，讓你找出a、b檔案共同的url?

假如每個url大小為64bytes，那麼可以估計每個檔案的大小為5G×64=320G，遠遠大於記憶體限制的4G，所以不可能將其完全載入到記憶體中處理，可以採用分治的思想來解決。　　Step1：遍歷檔案a，對每個url求取hash(url)%1000，然後根據所取得的

阿里電話面試問題----100萬個URL如何找到出現頻率最高的前100個？

相關推薦