python find pid by listen port

Codes

import sys
import subprocess
import shlex
import re

def find_pid_by_listen_port(port):
    """Return the PID of the process listening on TCP ``port``, or ``None``.

    Only Windows is supported: the lookup parses ``netstat -a -n -o`` output.

    Args:
        port: Local TCP port number (an int, or anything ``int()`` accepts).

    Returns:
        The owning process id as a string (the last netstat column), or
        ``None`` when netstat reports no listening socket on that port.

    Raises:
        NotImplementedError: On any platform other than ``win32``.  It is an
            ``Exception`` subclass, so ``except Exception`` callers still work.
    """
    if sys.platform != 'win32':
        raise NotImplementedError('not support platform ' + sys.platform)

    output = subprocess.check_output(
        ['netstat', '-a', '-n', '-o'], universal_newlines=True)
    # Anchor the port to the local-address column (2nd field) and require the
    # LISTENING state.  An unanchored pattern also matches rows whose *foreign*
    # address uses the port, i.e. a client connected to a remote server, and
    # would hand back that client's PID.
    # NOTE(review): assumes netstat prints the state as "LISTENING"; confirm on
    # localized Windows builds.
    listening_row = re.compile(
        r'^\s*TCP\s+\S+:{0}\s+\S+\s+LISTENING\s+([0-9]+)\s*$'.format(int(port)),
        re.MULTILINE)
    match = listening_row.search(output)
    return match.group(1) if match else None
    
# Look up the process that owns port 9160 and show its PID (None if nothing
# is listening there).
LISTEN_PORT = 9160
pid = find_pid_by_listen_port(LISTEN_PORT)
print(pid)

python download Cassandra and start up in Windows

Description

This program kills any running Cassandra instance before deploying a new one.

Codes

import os
import shutil
import sys
import math
import urllib.request
import subprocess
import shlex
import signal
import time
import re
from datetime import date

def http_download(download_url, download_file, chunk_size=64 * 1024):
    """Download ``download_url`` to ``download_file``, printing text progress.

    The progress line is redrawn in place (``\\r``) after every chunk and is
    terminated with a newline once the transfer ends.

    Args:
        download_url: URL understood by ``urllib.request.urlopen``.
        download_file: Path of the local file to (over)write.
        chunk_size: Number of bytes requested per read.

    Raises:
        OSError: If the server announced a ``Content-Length`` but the stream
            ended before that many bytes arrived.
    """
    with urllib.request.urlopen(download_url) as response, \
            open(download_file, 'wb') as target:
        # ``headers`` is available on HTTP and non-HTTP (file:, ftp:) responses,
        # and the header may legitimately be missing (e.g. chunked transfer).
        length_header = response.headers.get('Content-Length')
        filesize = int(length_header) if length_header is not None else None
        wrotesize = 0
        while True:
            chunk = response.read(chunk_size)
            # An empty read means end of stream.  Comparing the byte count with
            # Content-Length instead would loop forever on a truncated transfer.
            if not chunk:
                break
            wrotesize += target.write(chunk)
            if filesize:
                download_percent = math.ceil((wrotesize / filesize) * 100)
                print('\rDownload {0} to {1} ... {2}%'.format(
                    download_url, download_file, download_percent), end='')
            else:
                # Total size unknown: show the running byte count instead.
                print('\rDownload {0} to {1} ... {2} bytes'.format(
                    download_url, download_file, wrotesize), end='')
    print()
    if filesize is not None and wrotesize < filesize:
        raise OSError('incomplete download: got {0} of {1} bytes'.format(
            wrotesize, filesize))

def find_pid_by_listen_port(port):
    """Return the PID of the process listening on TCP ``port``, or ``None``.

    Only Windows is supported: the lookup parses ``netstat -a -n -o`` output.

    Args:
        port: Local TCP port number (an int, or anything ``int()`` accepts).

    Returns:
        The owning process id as a string (the last netstat column), or
        ``None`` when netstat reports no listening socket on that port.

    Raises:
        NotImplementedError: On any platform other than ``win32``.  It is an
            ``Exception`` subclass, so ``except Exception`` callers still work.
    """
    if sys.platform != 'win32':
        raise NotImplementedError('not support platform ' + sys.platform)

    output = subprocess.check_output(
        ['netstat', '-a', '-n', '-o'], universal_newlines=True)
    # Anchor the port to the local-address column (2nd field) and require the
    # LISTENING state.  The result of this function is passed to os.kill by the
    # script, so matching a row whose *foreign* address uses the port (a client
    # connected to some remote server) would terminate the wrong process.
    # NOTE(review): assumes netstat prints the state as "LISTENING"; confirm on
    # localized Windows builds.
    listening_row = re.compile(
        r'^\s*TCP\s+\S+:{0}\s+\S+\s+LISTENING\s+([0-9]+)\s*$'.format(int(port)),
        re.MULTILINE)
    match = listening_row.search(output)
    return match.group(1) if match else None
    
    
# --- Settings: where the tarball comes from, where it is unpacked and run ---
download_url = 'http://ftp.tc.edu.tw/pub/Apache/cassandra/2.0.4/apache-cassandra-2.0.4-bin.tar.gz'
download_file = 'd:/apache-cassandra-2.0.4-bin.tar.gz'
deploy_root = 'd:/deploy'
# Every deployment is unpacked into a folder named after today's date.
today = date.today().isoformat()
unzip_folder = 'd:/deploy/work/{0}/apache-cassandra-2.0.4-bin'.format(today)
execute_folder = unzip_folder + '/apache-cassandra-2.0.4'
execute_path = execute_folder + '/bin/cassandra.bat'
executable = 'start cmd /c ' + execute_path
executable_argv = shlex.split(executable)

# --- 1. Terminate whatever process currently owns port 9160 ---
pid = find_pid_by_listen_port(9160)
if pid:
    print('kill process id', pid)
    os.kill(int(pid), signal.SIGTERM)

# --- 2. Remove the previous deployment tree, if any ---
if os.path.exists(deploy_root):
    print('Delete d:/deploy')
    shutil.rmtree(deploy_root)

# --- 3. Download the tarball and unpack it into the fresh folder ---
http_download(download_url, download_file)
os.makedirs(unzip_folder)
print('unzip {0} to {1}'.format(download_file, unzip_folder))
shutil.unpack_archive(download_file, unzip_folder)

# --- 4. Launch the batch file through the shell ---
print('Execute', execute_path)
subprocess.Popen(executable_argv, shell=True)
    

python download text percent progress

Description

想學一下 Maven download 時候的 text percent progress

Codes

import sys
import math
import urllib.request

def http_download(download_url, download_file, chunk_size=64 * 1024):
    """Download ``download_url`` to ``download_file``, printing text progress.

    The progress line is redrawn in place (``\\r``) after every chunk and is
    terminated with a newline once the transfer ends, so later output (or
    the shell prompt) does not land on top of it.

    Args:
        download_url: URL understood by ``urllib.request.urlopen``.
        download_file: Path of the local file to (over)write.
        chunk_size: Number of bytes requested per read.

    Raises:
        OSError: If the server announced a ``Content-Length`` but the stream
            ended before that many bytes arrived.
    """
    with urllib.request.urlopen(download_url) as response, \
            open(download_file, 'wb') as target:
        # ``headers`` is available on HTTP and non-HTTP (file:, ftp:) responses,
        # and the header may legitimately be missing (e.g. chunked transfer).
        length_header = response.headers.get('Content-Length')
        filesize = int(length_header) if length_header is not None else None
        wrotesize = 0
        while True:
            chunk = response.read(chunk_size)
            # An empty read means end of stream.  Comparing the byte count with
            # Content-Length instead would loop forever on a truncated transfer.
            if not chunk:
                break
            wrotesize += target.write(chunk)
            if filesize:
                download_percent = math.ceil((wrotesize / filesize) * 100)
                print('\rDownload {0} to {1} ... {2}%'.format(
                    download_url, download_file, download_percent), end='')
            else:
                # Total size unknown: show the running byte count instead.
                print('\rDownload {0} to {1} ... {2} bytes'.format(
                    download_url, download_file, wrotesize), end='')
    print()
    if filesize is not None and wrotesize < filesize:
        raise OSError('incomplete download: got {0} of {1} bytes'.format(
            wrotesize, filesize))

# Command line: <script> download_url download_file
if len(sys.argv) >= 3:
    download_url, download_file = sys.argv[1], sys.argv[2]
    http_download(download_url, download_file)
else:
    # Not enough arguments: show how to call the script.
    print('Usage: {0} download_url download_file'.format(sys.argv[0]))

使用方式

c:\workspace_python>python test.py http://ftp.tc.edu.tw/pub/Apache/cassandra/2.0.4/apache-cassandra-2.0.4-bin.tar.gz d:/apache-cassandra-2.0.4-bin.tar.gz

python http download cassandra

Reference

http://www.apache.org/dyn/closer.cgi?path=/cassandra/2.0.4/apache-cassandra-2.0.4-bin.tar.gz
http://docs.python.org/3/library/urllib.request.html#module-urllib.request

Description

用 urllib 下載 cassandra 到本地

Codes

簡單的方式

import urllib.request

SOURCE_URL = 'http://ftp.tc.edu.tw/pub/Apache/cassandra/2.0.4/apache-cassandra-2.0.4-bin.tar.gz'
TARGET_FILE = 'd:/apache-cassandra-2.0.4-bin.tar.gz'

# Read the whole response body into memory, then write it out in one call.
with urllib.request.urlopen(SOURCE_URL) as response, \
        open(TARGET_FILE, 'wb') as target:
    body = response.read()
    target.write(body)

如果想要看進度可以這樣

import urllib.request

with urllib.request.urlopen('http://ftp.tc.edu.tw/pub/Apache/cassandra/2.0.4/apache-cassandra-2.0.4-bin.tar.gz') as f:
    with open('d:/apache-cassandra-2.0.4-bin.tar.gz','wb') as target:
        # Content-Length is used for display only; it can be absent, so it must
        # not be the loop's stop condition.
        filesize = f.getheader('Content-Length')
        if filesize is None:
            filesize = 'unknown'
        wrotesize = 0
        while True:
            chunk = f.read(1024)
            # An empty read marks the end of the stream.  Waiting for the byte
            # count to equal Content-Length never terminates if the connection
            # drops early.
            if not chunk:
                break
            wrotesize += target.write(chunk)
            print('download...',wrotesize,'of',filesize)
            

想從公司內的 Maven download 還需要認證

import urllib.request

# Register the credentials with realm None, so the password manager offers
# them whatever realm the server announces for this URL prefix.
password_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, 'http://mvn.company.site', 'myid', 'mypw')

# Install an opener with Basic-auth support globally, so the plain urlopen()
# call below uses it.
urllib.request.install_opener(
    urllib.request.build_opener(
        urllib.request.HTTPBasicAuthHandler(password_manager)))

with urllib.request.urlopen('http://mvn.company.site/downloadfile.zip') as response, \
        open('d:/downloadfile.zip', 'wb') as target:
    target.write(response.read())

後記

還不太清楚 HTTPBasicAuthHandler 的 realm 要怎麼指定正確, 
使用 Maven response 的 realm 也不行.
只好先用 HTTPPasswordMgrWithDefaultRealm 了...

python compress/decompress file

Reference

http://docs.python.org/3/library/shutil.html#module-shutil

Description

把 d:/nginx-1.4.3 壓縮到 d:/ziptarget/test.zip
再把 d:/ziptarget/test.zip 解壓縮到 d:/unziptarget/

Codes (適合 python 3.2 以上)

import shutil

# Step 1: zip everything under source_dir into "<archive_base>.zip".
source_dir = 'd:/nginx-1.4.3'
archive_base = 'd:/ziptarget/test'
zip_path = shutil.make_archive(archive_base, 'zip', source_dir)

# Step 2: unpack the archive that was just created into extract_dir.
extract_dir = 'd:/unziptarget'
shutil.unpack_archive(zip_path, extract_dir)

python 的 iterator 與 (神奇的) yield (iterator generator)

Reference

http://docs.python.org/3/tutorial/classes.html#iterators
http://docs.python.org/3/tutorial/classes.html#generators

Iterator

這段程式

# range(5) hands the for loop an iterable producing 0, 1, 2, 3, 4.
for number in range(5):
    print(number)

執行結果為

0
1
2
3
4

可以執行的原因是 range 回傳了一個可以 iterate 的物件,
讓 python 可以呼叫 "next" 取得下一個值.
如果想要自己實作一個能用 for iterate 的物件時,
只需要實作 __iter__ 與 __next__ 這兩個 function,
for 迴圈執行時 python 會自動執行這兩個 function.
我們只要在 __next__ 被呼叫時維持好物件的狀態即可.
通知 for 迴圈停止的方式是丟出一個 StopIteration 的 error


class NumberIterator:
    """Iterator that yields 1, 2, ..., ``max`` and traces each protocol call.

    The ``print`` calls in ``__iter__`` and ``__next__`` exist to show when
    Python invokes the iterator protocol.
    """

    def __init__(self, max):
        # ``max`` shadows the builtin, but the parameter name is part of the
        # public signature and is therefore kept.
        self.max = max
        self.current = 0

    def __iter__(self):
        """Return the iterator itself (invoked by ``iter()`` / ``for``)."""
        print('iter is called. max=',self.max)
        return self

    def __next__(self):
        """Return the next number, or raise ``StopIteration`` when done."""
        print('next is called, current=', self.current, ', max=', self.max)
        # ">=" instead of "==": with a negative or fractional max the counter
        # never equals max exactly, which would make the iterator endless.
        if self.current >= self.max:
            raise StopIteration
        self.current += 1
        return self.current
        
it = NumberIterator(5)
# The "for" statement calls iter() on the NumberIterator first,
# so 'iter is called. max=' is printed once before the values.
for i in it:
    print(i) # prints 1 through 5

# No "for" statement here: next() is called directly, iter() is never
# invoked, so 'iter is called. max=' is not printed.
it = NumberIterator(4)
print(next(it))
print(next(it))
print(next(it))
print(next(it))
print(next(it)) #StopIteration

執行結果

d:\workspace_python>python test.py
iter is called. max= 5
next is called, current= 0 , max= 5
1
next is called, current= 1 , max= 5
2
next is called, current= 2 , max= 5
3
next is called, current= 3 , max= 5
4
next is called, current= 4 , max= 5
5
next is called, current= 5 , max= 5
next is called, current= 0 , max= 4
1
next is called, current= 1 , max= 4
2
next is called, current= 2 , max= 4
3
next is called, current= 3 , max= 4
4
next is called, current= 4 , max= 4
Traceback (most recent call last):
  File "test.py", line 27, in 
    print(next(it)) #StopIteration
  File "test.py", line 11, in __next__
    raise StopIteration
StopIteration

yield (iterator generator)

神奇的 yield, 這個 statement 是出現在 function 裡面, 
讓程式先回傳, 但 function 內的狀態保持不變, 
等下次程式的 __next__ 被呼叫時, 程式會從 yield 之後的碼繼續執行.
def test_yield():
    """Generator that prints a trace around every ``yield``.

    Yields 0, then 1, then the squares 0, 1 and 4.  The prints show that the
    function body pauses at each ``yield`` and resumes right after it when
    the next value is requested.
    """
    counter = 0
    print('before the first yield',counter)
    yield counter
    print('after the first yield',counter)

    counter += 1
    print('before the second yield',counter)
    yield counter
    print('after the second yield',counter)

    # The loop variable supplies the remaining values; the counter is no
    # longer used from here on.
    for base in range(3):
        print('before yield in for stmt')
        yield base ** 2
        print('after yield in for stmt')
    
# Each pass of the loop resumes the generator up to its next yield.
for value in test_yield():
    print(value)

執行結果
d:\workspace_python>python test.py
before the first yield 0
0
after the first yield 0
before the second yield 1
1
after the second yield 1
before yield in for stmt
0
after yield in for stmt
before yield in for stmt
1
after yield in for stmt
before yield in for stmt
4
after yield in for stmt

JDK6 to JDK7 => Basic IO

Reference

讀寫小檔案

如果檔案比較小可以使用 Files 提供的 read/write.
package test;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.List;

/**
 * Writes three lines to a small text file with {@code Files.write} and reads
 * them back with {@code Files.readAllLines}.
 */
public class TestIO {

    public static void main(String[] params) throws IOException {
        Charset cs = Charset.forName("utf-8");
        Path path = Paths.get("d:/", "test.txt");
        List<String> lines = Arrays.asList("line1", "line2", "line3");
        // When options are given explicitly, the defaults (which include
        // CREATE) no longer apply, so TRUNCATE_EXISTING alone fails if the
        // file does not exist yet. CREATE makes the first run work too.
        Files.write(path, lines, cs,
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING);
        List<String> readLines = Files.readAllLines(path, cs);
        System.out.println(readLines);
    }

}

讀寫大檔案

如果是要讀寫大的檔案, 可以用 Files 的 newBufferedReader/newBufferedWriter/newByteChannel
package test;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.UUID;

/**
 * Writes a large numbered text file through a {@code BufferedWriter}, then
 * streams it back through a {@code BufferedReader} and checks that every
 * line carries the expected index.
 */
public class TestIO {

    public static void main(String[] params) throws IOException {
        Charset cs = Charset.forName("utf-8");
        Path path = Paths.get("d:/", "test.txt");
        int wLineCnt = 25000000;
        // CREATE is required next to TRUNCATE_EXISTING: once options are
        // passed explicitly the default CREATE is gone, and opening a file
        // that does not exist yet would fail.
        try (BufferedWriter writer = Files.newBufferedWriter(path, cs,
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING)) {
            for (int i = 0; i < wLineCnt; i++) {
                writer.write(i + ":" + UUID.randomUUID().toString() + "\n");
            }
            // No explicit flush: closing the writer flushes it.
        }
        try (BufferedReader reader = Files.newBufferedReader(path, cs)) {
            int rLineCnt = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                int num = Integer.parseInt(line.substring(0, line.indexOf(':')));
                // Compare first, increment afterwards, so the message shows
                // the index that was actually expected.
                if (num != rLineCnt) {
                    throw new RuntimeException(
                            "not equal! " + num + "," + rLineCnt);
                }
                rLineCnt++;
            }
            if (rLineCnt != wLineCnt) {
                throw new RuntimeException(
                        "not equal! " + wLineCnt + "," + rLineCnt);
            }
        }
        System.out.println("done");
    }

}

取根目錄

package test;

import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.Path;

/** Prints every root directory of the default file system. */
public class TestIO {

    public static void main(String[] params) throws IOException {
        Iterable<Path> roots = FileSystems.getDefault().getRootDirectories();
        for (Path root : roots) {
            // Lists C:/, D:/ and so on.
            System.out.println(root);
        }
    }

}


建資料夾

package test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

/** Creates a nested directory, including any missing parent directories. */
public class TestIO {

    /** Deepest directory to create; missing parents are created as well. */
    private static final String TARGET_DIR = "d:/test1/test2/test3/test4";

    public static void main(String[] params) throws IOException {
        Files.createDirectories(Paths.get(TARGET_DIR));
    }

}

Iterate資料夾

JDK 7提供新的 interface 讓你 iterate 資料夾: DirectoryStream.
在 API 上說這個 interface 比較 scalable, 可以用來 iterate 很大的 folder.
上網查有人說舊的 File#listFiles 會把所有的 folder 載入會佔用比較多的記憶體.
要省記憶體應該要讓資料夾下有上百萬或上千萬個資料夾或檔案才有感覺吧.
我測一百萬個資料夾也沒甚麼差.
後來覺得關鍵是 API 這句話:
The iterator is weakly consistent. It is thread safe but does not freeze the directory while iterating, so it may (or may not) reflect updates to the directory that occur after the DirectoryStream is created.
在資料夾下檔案或資料夾數量很大時, 原本的 File 仍須要給 API client 精準的資訊.
scale 愈大, "精準"的成本就愈大.
DirectoryStream 選擇一開始就告訴你它不準, 所以贏在起跑點, 可以用較少的資源取資料.
等你真的要處理某個檔案再動作就行了.
我覺得這種 interface 更有利於硬碟在遠端的 case. 硬碟不在本機,
要求 File 物件保證整個資料夾資訊要對太難, 這時候 DirectoryStream 就可以處理很好.
要注意 DirectoryStream 用完要關掉, 可以在 try () 裡面宣告.

package test;

import java.io.File;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Creates one million folders under d:/test, then times listing them with
 * NIO ({@code DirectoryStream}) and with classic IO ({@code File.list}).
 */
public class TestIO {

    public static void main(String[] params) throws IOException {
        long start = System.currentTimeMillis();
        createFolders();
        System.out.println("spend " + (System.currentTimeMillis() - start)
                    + " millis to create folders");

        start = System.currentTimeMillis();
        System.out.println(listFoldersNIO(Paths.get("d:/test")));
        System.out.println("spend " + (System.currentTimeMillis() - start)
                    + " millis to list folders (NIO)");

        start = System.currentTimeMillis();
        System.out.println(listFoldersIO(Paths.get("d:/test").toFile()));
        System.out.println("spend " + (System.currentTimeMillis() - start)
                    + " millis to list folders (IO)");
    }

    /** Creates d:/test/test0 ... d:/test/test999999, printing each index. */
    private static void createFolders() throws IOException {
        for ( int i = 0; i < 1000000; i++ ) {
            Files.createDirectories(Paths.get("d:/test/test" + i));
            System.out.println(i);
        }
        System.out.println("done");
    }

    /**
     * Counts the children of {@code folder} using {@code File.list()}.
     *
     * @throws IOException if the folder cannot be listed
     */
    private static int listFoldersIO(File folder) throws IOException {
        // File.list() returns null (rather than throwing) when the path is
        // not a directory or an I/O error occurs.
        String[] children = folder.list();
        if ( children == null ) {
            throw new IOException("cannot list " + folder);
        }
        int cnt = 0;
        for ( String child: children ) {
            // Presumably mirrors child.toFile() in the NIO variant so both
            // loops do comparable per-entry work.
            new File(child);
            cnt++;
        }
        return cnt;
    }

    /** Counts the children of {@code path} using a {@code DirectoryStream}. */
    private static int listFoldersNIO(Path path) throws IOException {
        int cnt = 0;
        // The type argument is required: iterating a raw DirectoryStream
        // yields Object, which cannot be assigned to the Path loop variable.
        try ( DirectoryStream<Path> dirStream = Files.newDirectoryStream(path) ) {
            for ( Path child: dirStream ) {
                child.toFile();
                cnt++;
            }
        }
        return cnt;
    }

}

指定 pattern 找資料夾下的檔名

JDK 7提供新的概念叫 Glob, 蠻好理解的.
不過其中有個 ** 的語法說明 "works like * but crosses directory boundaries" 讓我有錯誤的期望以為只要 Files.newDirectoryStream(p, "**/*.{java,jar}") 就可以列舉全部的檔案, 測試結果不行..
可能要用 Files#walkFileTree. 我還沒看 walkFileTree 有甚麼特殊的地方,
但 API 沒特別好用的話我自己作就好了為什麼要提供 walkFileTree 呢...?
(在下面的段落練習了 walkFileTree, 比起自己寫用 walkFileTree 簡潔多了 XD)

package test;

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Recursively prints every entry under a directory whose file name matches
 * a glob pattern.
 */
public class TestIO {

    public static void main(String[] params) throws IOException {
        listAll(Paths.get("d:\\Isaac"), "*.{java,jar}");
    }

    /**
     * Prints the entries of {@code p} matching {@code globPattern}, then
     * descends into each sub-directory of {@code p}.
     */
    private static void listAll(Path p, String globPattern) throws IOException {
        // Pass 1: entries of this directory that match the glob.
        try (DirectoryStream<Path> s = Files.newDirectoryStream(p, globPattern)) {
            for ( Path c: s ) {
                System.out.println(c);
            }
        }
        // Pass 2: recurse into sub-directories without following symbolic links.
        try (DirectoryStream<Path> s = Files.newDirectoryStream(p)) {
            for ( Path c: s ) {
                // Fully qualified because this snippet's import list does not
                // include LinkOption.
                if ( Files.isDirectory(c, java.nio.file.LinkOption.NOFOLLOW_LINKS) ) {
                    listAll(c, globPattern);
                }
            }
        }
    }

}

搜尋 activemq source code 中有關鍵字 "Queue" 的檔案

自己實作 DirectoryStream.Filter 的時候不能方便的使用 glob 有點可惜..

package test;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Recursively prints every .java file under a directory that contains a
 * given text.
 */
public class TestIO {

    public static void main(String[] params) throws IOException {
        listAll(Paths.get("d:\\activemq-parent-5.8.0-source-release"), "Queue");
    }

    /**
     * Prints the matching files directly inside {@code p}, then repeats the
     * search in each sub-directory of {@code p}.
     */
    private static void listAll(Path p, final String containTxt)
    throws IOException {
        try (DirectoryStream<Path> hits =
                Files.newDirectoryStream(p, new ContainsTextFilter(containTxt))) {
            for ( Path hit: hits ) {
                System.out.println("match " + hit);
            }
        }
        try (DirectoryStream<Path> children = Files.newDirectoryStream(p)) {
            for ( Path child: children ) {
                if ( !Files.isDirectory(child, LinkOption.NOFOLLOW_LINKS) ) {
                    continue;
                }
                listAll(child, containTxt);
            }
        }
    }

    /** Accepts non-directory ".java" entries with a line containing the text. */
    private static final class ContainsTextFilter
    implements DirectoryStream.Filter<Path> {

        private final String needle;

        ContainsTextFilter(String needle) {
            this.needle = needle;
        }

        @Override
        public boolean accept(Path entry) throws IOException {
            boolean javaFile =
                    !Files.isDirectory(entry, LinkOption.NOFOLLOW_LINKS)
                    && entry.toString().endsWith(".java");
            if ( !javaFile ) {
                return false;
            }
            try (BufferedReader reader = Files.newBufferedReader(
                                  entry, Charset.forName("UTF8"))) {
                for ( String line = reader.readLine(); line != null;
                        line = reader.readLine() ) {
                    if ( line.contains(needle) ) {
                        return true;
                    }
                }
            }
            return false;
        }
    }

}

用 Files#walkFileTree + FileVisitor 列出所有資料夾下的 java 與 jar 檔

package test;

import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;

/** Walks a directory tree and prints the path of every .java and .jar file. */
public class TestIO {

    public static void main(String[] params) throws IOException {
        Files.walkFileTree(Paths.get("d:/Isaac"), new JavaAndJarPrinter());
    }

    /** Visitor that prints files whose path ends with ".java" or ".jar". */
    private static final class JavaAndJarPrinter extends SimpleFileVisitor<Path> {

        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
                throws IOException {
            String name = file.toString();
            boolean wanted = name.endsWith(".java") || name.endsWith(".jar");
            if ( wanted ) {
                System.out.println(name);
            }
            // The result returned to the walk comes from the superclass.
            return super.visitFile(file, attrs);
        }
    }

}

搜尋 activemq source code 中有關鍵字 "Queue" 的檔案 (用 glob)

前面才提到 "自己實作 DirectoryStream.Filter 的時候不能方便的使用 glob 有點可惜" 而已, 就發現其實有提供. 那就是 PathMatcher.
package test;

import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;

/**
 * Walks a directory tree and prints every file whose name matches a glob,
 * using a {@code PathMatcher}.
 */
public class TestIO {

    /** Syntax-prefixed pattern passed to {@code getPathMatcher}. */
    private static final String PATTERN = "glob:*.{java,jar}";

    public static void main(String[] params) throws Throwable {
        final PathMatcher byName = FileSystems.getDefault().getPathMatcher(PATTERN);
        SimpleFileVisitor<Path> reporter = new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file,
                    BasicFileAttributes attrs) throws IOException {
                // Only the last path element is handed to the matcher.
                Path name = file.getFileName();
                if ( byName.matches(name) ) {
                    System.out.println("match " + file);
                }
                return super.visitFile(file, attrs);
            }
        };
        Files.walkFileTree(Paths.get("d:/Isaac"), reporter);
    }

}

監聽一個 path 是否被新增/刪除/修改

package test;

import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardWatchEventKinds;
import java.nio.file.WatchEvent;
import java.nio.file.WatchKey;
import java.nio.file.WatchService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Watches a directory for one day and prints every create / delete / modify
 * event reported by the {@code WatchService}.
 */
public class TestIO {

    public static void main(String[] params) throws Throwable {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        Path watched = Paths.get("d:/Isaac");
        try (final WatchService watcher = FileSystems.getDefault().newWatchService()) {
            watched.register(watcher, StandardWatchEventKinds.ENTRY_CREATE,
                                      StandardWatchEventKinds.ENTRY_DELETE,
                                      StandardWatchEventKinds.ENTRY_MODIFY);
            executor.execute(new Runnable() {
                @Override
                public void run() {
                    while (true) {
                        try {
                            // Blocks until a key with pending events is available.
                            WatchKey key = watcher.take();
                            for ( WatchEvent<?> event: key.pollEvents() ) {
                                @SuppressWarnings("unchecked")
                                WatchEvent<Path> pathEvent = (WatchEvent<Path>) event;
                                Path changed = pathEvent.context();
                                System.out.println(pathEvent.kind() + ":" + changed);
                            }
                            // Reset once, after all polled events were handled,
                            // so the key can be queued again. A false result
                            // means the key is no longer valid: stop watching.
                            if ( !key.reset() ) {
                                return;
                            }
                        } catch (InterruptedException ex) {
                            // Executor shutdown: restore the flag and stop.
                            Thread.currentThread().interrupt();
                            return;
                        } catch (java.nio.file.ClosedWatchServiceException ex) {
                            // The watcher was closed by main(): nothing left to do.
                            return;
                        } catch (Throwable ex) {
                            // Best effort: report and keep watching.
                            ex.printStackTrace();
                        }
                    }
                }
            });
            TimeUnit.DAYS.sleep(1);
        } finally {
            // Without this the executor's non-daemon worker thread keeps the
            // JVM alive after main() returns.
            executor.shutdownNow();
        }
    }
}


別名演算法 Alias Method

 題目 每個伺服器支援不同的 TPM (transaction per minute) 當 request 來的時候, 系統需要馬上根據 TPM 的能力隨機找到一個適合的 server. 雖然稱為 "隨機", 但還是需要有 TPM 作為權重. 解法 別名演算法...